From acc7aee86881b3c5c16bd525a10c971f42a2fb87 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 18 Jul 2023 17:32:45 +0200 Subject: [PATCH 01/16] Update energy mix version in country_profile dataset --- dag/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dag/main.yml b/dag/main.yml index 9d2d3d9a870..619e5a72c10 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -215,7 +215,7 @@ steps: - data://garden/regions/2023-01-01/regions - data://garden/gcp/2023-04-28/global_carbon_budget - data://garden/democracy/2023-03-02/vdem - - data://garden/bp/2023-02-20/energy_mix + - data://garden/energy/2023-07-10/energy_mix - data://garden/worldbank_wdi/2022-05-26/wdi # Global GDP in the long run From eb28df04dd35f14f66555645aa926d9527b775b5 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 09:50:43 +0200 Subject: [PATCH 02/16] Duplicate GCP and emissions datasets --- .../emissions/2023-07-10/owid_co2.meta.yml | 12 + .../garden/emissions/2023-07-10/owid_co2.py | 424 ++++++++ .../global_carbon_budget.countries.json | 278 ++++++ ...obal_carbon_budget.excluded_countries.json | 4 + .../2023-07-10/global_carbon_budget.meta.yml | 483 +++++++++ .../gcp/2023-07-10/global_carbon_budget.py | 945 ++++++++++++++++++ .../gcp/2023-07-10/global_carbon_budget.py | 81 ++ .../gcp/2023-07-10/global_carbon_budget.py | 238 +++++ 8 files changed, 2465 insertions(+) create mode 100644 etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml create mode 100644 etl/steps/data/garden/emissions/2023-07-10/owid_co2.py create mode 100644 etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.countries.json create mode 100644 etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.excluded_countries.json create mode 100644 etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml create mode 100644 etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.py create mode 100644 etl/steps/data/grapher/gcp/2023-07-10/global_carbon_budget.py create 
mode 100644 etl/steps/data/meadow/gcp/2023-07-10/global_carbon_budget.py diff --git a/etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml b/etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml new file mode 100644 index 00000000000..080d012dcff --- /dev/null +++ b/etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml @@ -0,0 +1,12 @@ +dataset: + title: OWID CO2 dataset (2023) + description: | + OWID CO2 dataset. + + This dataset will be loaded by the co2-data repository, to create a csv file of the dataset that can be downloaded in one click. + +# Dataset sources will be created in the step by combining all component datasets' sources. +# Also, table metadata will be built from the tables' original metadata. + +tables: + {} diff --git a/etl/steps/data/garden/emissions/2023-07-10/owid_co2.py b/etl/steps/data/garden/emissions/2023-07-10/owid_co2.py new file mode 100644 index 00000000000..52371f4a84d --- /dev/null +++ b/etl/steps/data/garden/emissions/2023-07-10/owid_co2.py @@ -0,0 +1,424 @@ +"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset (2022). + +Datasets combined: +* Global Carbon Budget (Global Carbon Project, 2022). +* National contributions to climate change (Jones et al. (2023), 2023). +* Greenhouse gas emissions by sector (CAIT, 2022). +* Primary energy consumption (BP & EIA, 2022) + +Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2020) on +GDP are included. + +""" + +from typing import List + +import numpy as np +import pandas as pd +from owid import catalog +from owid.datautils import dataframes + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Details for dataset to export. +DATASET_SHORT_NAME = "owid_co2" +DATASET_TITLE = "CO2 dataset (OWID, 2022)" + +# Conversion factor from tonnes to million tonnes. 
+TONNES_TO_MILLION_TONNES = 1e-6 + +# Select columns to use from each dataset, and how to rename them. +GCP_COLUMNS = { + "country": "country", + "year": "year", + "emissions_total": "co2", + "emissions_total_per_capita": "co2_per_capita", + "traded_emissions": "trade_co2", + "emissions_from_cement": "cement_co2", + "emissions_from_cement_per_capita": "cement_co2_per_capita", + "emissions_from_coal": "coal_co2", + "emissions_from_coal_per_capita": "coal_co2_per_capita", + "emissions_from_flaring": "flaring_co2", + "emissions_from_flaring_per_capita": "flaring_co2_per_capita", + "emissions_from_gas": "gas_co2", + "emissions_from_gas_per_capita": "gas_co2_per_capita", + "emissions_from_oil": "oil_co2", + "emissions_from_oil_per_capita": "oil_co2_per_capita", + "emissions_from_other_industry": "other_industry_co2", + "emissions_from_other_industry_per_capita": "other_co2_per_capita", + "pct_growth_emissions_total": "co2_growth_prct", + "growth_emissions_total": "co2_growth_abs", + "emissions_total_per_gdp": "co2_per_gdp", + "emissions_total_per_unit_energy": "co2_per_unit_energy", + "consumption_emissions": "consumption_co2", + "consumption_emissions_per_capita": "consumption_co2_per_capita", + "consumption_emissions_per_gdp": "consumption_co2_per_gdp", + "cumulative_emissions_total": "cumulative_co2", + "cumulative_emissions_from_cement": "cumulative_cement_co2", + "cumulative_emissions_from_coal": "cumulative_coal_co2", + "cumulative_emissions_from_flaring": "cumulative_flaring_co2", + "cumulative_emissions_from_gas": "cumulative_gas_co2", + "cumulative_emissions_from_oil": "cumulative_oil_co2", + "cumulative_emissions_from_other_industry": "cumulative_other_co2", + "pct_traded_emissions": "trade_co2_share", + "emissions_total_as_share_of_global": "share_global_co2", + "emissions_from_cement_as_share_of_global": "share_global_cement_co2", + "emissions_from_coal_as_share_of_global": "share_global_coal_co2", + "emissions_from_flaring_as_share_of_global": 
"share_global_flaring_co2", + "emissions_from_gas_as_share_of_global": "share_global_gas_co2", + "emissions_from_oil_as_share_of_global": "share_global_oil_co2", + "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", + "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", + "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", + "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", + "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", + "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", + "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", + "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", + # New variables, related to land-use change emissions. + "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", + "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", + "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", + "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", + "emissions_from_land_use_change": "land_use_change_co2", + "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", + "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", + "emissions_total_including_land_use_change": "co2_including_luc", + "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", + "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", + "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", + "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", + 
"growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", + "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", +} +JONES_COLUMNS = { + "country": "country", + "year": "year", + "temperature_response_co2_total": "temperature_change_from_co2", + "temperature_response_ghg_total": "temperature_change_from_ghg", + "temperature_response_ch4_total": "temperature_change_from_ch4", + "temperature_response_n2o_total": "temperature_change_from_n2o", + "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", +} +CAIT_GHG_COLUMNS = { + "country": "country", + "year": "year", + "total_excluding_lucf": "total_ghg_excluding_lucf", + "total_excluding_lucf__per_capita": "ghg_excluding_lucf_per_capita", + "total_including_lucf": "total_ghg", + "total_including_lucf__per_capita": "ghg_per_capita", +} +CAIT_CH4_COLUMNS = { + "country": "country", + "year": "year", + "total_including_lucf": "methane", + "total_including_lucf__per_capita": "methane_per_capita", +} +CAIT_N2O_COLUMNS = { + "country": "country", + "year": "year", + "total_including_lucf": "nitrous_oxide", + "total_including_lucf__per_capita": "nitrous_oxide_per_capita", +} +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", + "primary_energy_consumption_per_capita__kwh": "energy_per_capita", + "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", +} +REGIONS_COLUMNS = { + "name": "country", + "iso_alpha3": "iso_code", +} +POPULATION_COLUMNS = { + "country": "country", + "year": "year", + "population": "population", +} +GDP_COLUMNS = { + "country": "country", + "year": "year", + "gdp": "gdp", +} + +UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes"}} + + +def unique_sources_from_datasets( + datasets: List[catalog.Dataset], +) -> List[catalog.meta.Source]: + """Gather unique sources from datasets. 
+ + Note: To check if a source is already listed, only the name of the source is considered (not the description or any + other field in the source). + + Parameters + ---------- + datasets : list + List of datasets with metadata. + + Returns + ------- + known_sources : list + List of unique sources from all datasets. + + """ + # Initialise list that will gather all unique metadata sources from the tables. + known_sources: List[catalog.meta.Source] = [] + for ds in datasets: + # Get list of sources of the dataset of current table. + table_sources = ds.metadata.sources + # Go source by source of current table, and check if its name is not already in the list of known_sources. + for source in table_sources: + # Check if this source's name is different to all known_sources. + if all([source.name != known_source.name for known_source in known_sources]): + # Add the new source to the list. + known_sources.append(source) + + return known_sources + + +def convert_units(table: catalog.Table) -> catalog.Table: + """Convert units of table. + + Parameters + ---------- + table : catalog.Table + Data with its original units. + + Returns + ------- + catalog.Table + Data after converting units of specific columns. + + """ + table = table.copy() + # Check units and convert to more convenient ones. 
+ for column in table.columns: + unit = table[column].metadata.unit + short_unit = table[column].metadata.short_unit + title = table[column].metadata.title + description = table[column].metadata.description + if unit in list(UNITS): + table[column] *= UNITS[unit]["conversion"] + table[column].metadata.unit = unit + table[column].metadata.short_unit = short_unit + table[column].metadata.title = title + table[column].metadata.description = description.replace(unit, UNITS[unit]["new_unit"]) + + return table + + +def combine_tables( + tb_gcp: catalog.Table, + tb_jones: catalog.Table, + tb_cait_ghg: catalog.Table, + tb_cait_ch4: catalog.Table, + tb_cait_n2o: catalog.Table, + tb_energy: catalog.Table, + tb_gdp: catalog.Table, + tb_population: catalog.Table, + tb_regions: catalog.Table, +) -> catalog.Table: + """Combine tables. + + Parameters + ---------- + tb_gcp : catalog.Table + Global Carbon Budget table (from Global Carbon Project). + tb_jones : catalog.Table + National contributions to climate change (from Jones et al. (2023)). + tb_cait_ghg : catalog.Table + Greenhouse gas emissions table (from CAIT). + tb_cait_ch4 : catalog.Table + CH4 emissions table (from CAIT). + tb_cait_n2o : catalog.Table + N2O emissions table (from CAIT). + tb_energy : catalog.Table + Primary energy consumption table (from BP & EIA). + tb_gdp : catalog.Table + Maddison GDP table (from GGDC). + tb_population : catalog.Table + OWID population table (from various sources). + tb_regions : catalog.Table + OWID regions table. + + Returns + ------- + combined : catalog.Table + Combined table with metadata and variables metadata. + + """ + # Gather all variables' metadata from all tables. + tables = [tb_gcp, tb_jones, tb_cait_ghg, tb_cait_ch4, tb_cait_n2o, tb_energy, tb_gdp, tb_population, tb_regions] + variables_metadata = {} + for table in tables: + for variable in table.columns: + # If variable does not have sources metadata, take them from the dataset metadata. 
+ if len(table[variable].metadata.sources) == 0: + if table.metadata.dataset is None: + table[variable].metadata.sources = [] + else: + table[variable].metadata.sources = table.metadata.dataset.sources + variables_metadata[variable] = table[variable].metadata + + # Combine main tables (with an outer join, to gather all entities from all tables). + tables = [tb_gcp, tb_jones, tb_cait_ghg, tb_cait_ch4, tb_cait_n2o] + combined = dataframes.multi_merge(dfs=tables, on=["country", "year"], how="outer") + + # Add secondary tables (with a left join, to keep only entities for which we have emissions data). + tables = [combined, tb_energy, tb_gdp, tb_population] + combined = dataframes.multi_merge(dfs=tables, on=["country", "year"], how="left") + + # Countries-regions dataset does not have a year column, so it has to be merged on country. + combined = pd.merge(combined, tb_regions, on="country", how="left") + + # Assign variables metadata back to combined dataframe. + for variable in variables_metadata: + combined[variable].metadata = variables_metadata[variable] + + # Check that there were no repetition in column names. + error = "Repeated columns in combined data." + assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error + + # Adjust units. + combined = convert_units(combined) + + # Adjust metadata. + combined.metadata.short_name = "owid_co2" + + return combined + + +def prepare_outputs(combined: catalog.Table) -> catalog.Table: + """Clean and prepare output table. + + Parameters + ---------- + combined : catalog.Table + Combined table. + + Returns + ------- + combined: catalog.Table + Cleaned combined table. + + """ + # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). 
+ columns_that_must_have_data = [ + column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] + ] + combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) + + # Sanity check. + columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] + assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" + + # Set index and sort conveniently. + combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() + + return combined + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the global carbon budget dataset from the Global Carbon Project (GCP). + ds_gcp: catalog.Dataset = paths.load_dependency("global_carbon_budget") + + # Load the Jones et al. (2023) dataset on national contributions to climate change. + ds_jones: catalog.Dataset = paths.load_dependency("national_contributions") + + # Load the greenhouse gas emissions by sector dataset by CAIT. + ds_cait: catalog.Dataset = paths.load_dependency("ghg_emissions_by_sector") + + # Load the GDP dataset by GGDC Maddison. + ds_gdp: catalog.Dataset = paths.load_dependency("ggdc_maddison") + + # Load primary energy consumption dataset (by different sources in our 'energy' namespace). + ds_energy: catalog.Dataset = paths.load_dependency("primary_energy_consumption") + + # Load population dataset. + ds_population: catalog.Dataset = paths.load_dependency("population") + + # Load countries-regions dataset (required to get ISO codes). + ds_regions: catalog.Dataset = paths.load_dependency("regions") + + # Gather all required tables from all datasets. 
+ tb_gcp = ds_gcp["global_carbon_budget"] + tb_jones = ds_jones["national_contributions"] + tb_cait_ghg = ds_cait["greenhouse_gas_emissions_by_sector"] + tb_cait_ch4 = ds_cait["methane_emissions_by_sector"] + tb_cait_n2o = ds_cait["nitrous_oxide_emissions_by_sector"] + tb_energy = ds_energy["primary_energy_consumption"] + tb_gdp = ds_gdp["maddison_gdp"] + tb_population = ds_population["population"] + tb_region_names = ds_regions["definitions"] + tb_region_codes = ds_regions["legacy_codes"] + + # + # Process data. + # + # Choose required columns and rename them. + tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS) + tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS) + tb_cait_ghg = tb_cait_ghg.reset_index()[list(CAIT_GHG_COLUMNS)].rename(columns=CAIT_GHG_COLUMNS) + tb_cait_ch4 = tb_cait_ch4.reset_index()[list(CAIT_CH4_COLUMNS)].rename(columns=CAIT_CH4_COLUMNS) + tb_cait_n2o = tb_cait_n2o.reset_index()[list(CAIT_N2O_COLUMNS)].rename(columns=CAIT_N2O_COLUMNS) + tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS) + tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS) + tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename(columns=POPULATION_COLUMNS) + tb_regions = ( + pd.merge(tb_region_names, tb_region_codes, left_index=True, right_index=True) + .reset_index()[list(REGIONS_COLUMNS)] + .rename(columns=REGIONS_COLUMNS) + ) + + # Combine tables. + combined = combine_tables( + tb_gcp=tb_gcp, + tb_jones=tb_jones, + tb_cait_ghg=tb_cait_ghg, + tb_cait_ch4=tb_cait_ch4, + tb_cait_n2o=tb_cait_n2o, + tb_energy=tb_energy, + tb_gdp=tb_gdp, + tb_population=tb_population, + tb_regions=tb_regions, + ) + + # Prepare outputs. + combined = prepare_outputs(combined=combined) + + # + # Save outputs. + # + ds_garden = create_dataset(dest_dir, tables=[combined]) + + # Gather metadata sources from all tables' original dataset sources. 
+ datasets = [ + ds_gcp, + ds_jones, + ds_cait, + ds_gdp, + ds_energy, + ds_regions, + ] + sources = unique_sources_from_datasets(datasets=datasets) + + # OWID population dataset does not have sources metadata. + sources.append( + catalog.meta.Source( + name="Our World in Data based on different sources (https://ourworldindata.org/population-sources)." + ) + ) + + ds_garden.metadata.sources = sources + + # Create dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.countries.json b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.countries.json new file mode 100644 index 00000000000..abaab52fe1b --- /dev/null +++ b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.countries.json @@ -0,0 +1,278 @@ +{ + "Afghanistan": "Afghanistan", + "Africa": "Africa (GCP)", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antarctica": "Antarctica", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Asia": "Asia (GCP)", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bolivia (Plurinational State of)": "Bolivia", + "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Bunkers": "International transport", + "Burkina Faso": "Burkina Faso", + 
"Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Central America": "Central America (GCP)", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Christmas Island": "Christmas Island", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Congo, Democratic Republic of the": "Democratic Republic of Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "EU27": "European Union (27) (GCP)", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Europe": "Europe (GCP)", + "Faeroe Islands": "Faroe Islands", + "Falkland Islands (Malvinas)": "Falkland Islands", + "Faroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Equatorial Africa": "French Equatorial Africa (GCP)", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "French West Africa": "French West Africa (GCP)", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Global": "World", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guatemala": "Guatemala", + "Guernsey": "Guernsey", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": 
"Haiti", + "Honduras": "Honduras", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "International Transport": "International transport", + "Iran": "Iran", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jersey": "Jersey", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea (Democratic People's Republic of)": "North Korea", + "Korea, Republic of": "South Korea", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kuwaiti Oil Fires": "Kuwaiti Oil Fires (GCP)", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Leeward Islands": "Leeward Islands (GCP)", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macao": "Macao", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Middle East": "Middle East (GCP)", + "Moldova": "Moldova", + "Moldova, Republic of": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", 
+ "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Non-OECD": "Non-OECD (GCP)", + "North America": "North America (GCP)", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "OECD": "OECD (GCP)", + "Occupied Palestinian Territory": "Palestine", + "Oceania": "Oceania (GCP)", + "Oman": "Oman", + "Pacific Islands (Palau)": "Palau", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Palestine, State of": "Palestine", + "Panama": "Panama", + "Panama Canal Zone": "Panama Canal Zone (GCP)", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Ryukyu Islands": "Ryukyu Islands (GCP)", + "R\u00e9union": "Reunion", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Martin (French part)": "Saint Martin (French part)", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South America": "South America (GCP)", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "St. Kitts-Nevis-Anguilla": "St. 
Kitts-Nevis-Anguilla (GCP)", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Svalbard and Jan Mayen": "Svalbard and Jan Mayen", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Syrian Arab Republic": "Syria", + "Taiwan": "Taiwan", + "Taiwan, Province of China": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Tanzania, United Republic of": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "USA": "United States", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Virgin Islands (U.S.)": "United States Virgin Islands", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Western Sahara": "Western Sahara", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "\u00c5land Islands": "Aland Islands" +} diff --git a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.excluded_countries.json b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.excluded_countries.json new file mode 100644 index 00000000000..e7a16636a61 --- /dev/null +++ b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.excluded_countries.json @@ -0,0 +1,4 @@ +[ + "KP Annex B", + "Non KP Annex B" +] \ No newline at end of file diff --git a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml 
b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml new file mode 100644 index 00000000000..d69c0584f73 --- /dev/null +++ b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml @@ -0,0 +1,483 @@ +dataset: + title: Global Carbon Budget (Global Carbon Project, 2023) + description: | + The Global Carbon Budget dataset is available here. + + Full reference: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., Schwinger, J., Séférian, R., Shutler, J. D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A. P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, 2022. 
+ + Variables include each country, region and World Bank income group's share of the global population; production-based (territorial); and consumption-based (trade-adjusted) carbon dioxide emissions. + + Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. + + Note that consumption-based emissions are not available for all countries; although those without complete data are a small fraction (3%) of the global total. + + Calculation of each country's share of the global population is calculated using our population dataset, based on different sources). + + Data on global emissions has been converted by Our World in Data from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664. + + Our World in Data have renamed bunker fuels as "International transport" for improved clarity, which includes emissions from international aviation and shipping. + Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year. + + licenses: + - name: Creative Commons Attribution 4.0 International + url: https://zenodo.org/record/7215364 + sources: + - name: Our World in Data based on the Global Carbon Project (2023) + published_by: "Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. 
A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., Schwinger, J., Séférian, R., Shutler, J. D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A. P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, 2022." + url: https://www.globalcarbonproject.org/ + date_accessed: 2023-04-28 + +tables: + global_carbon_budget: + variables: + consumption_emissions: + title: "Annual consumption-based CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Data has been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." 
+ consumption_emissions_as_share_of_global: + title: "Share of global annual CO₂ consumption-based emissions" + unit: "%" + short_unit: "%" + description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide." + consumption_emissions_per_capita: + title: "Annual consumption-based CO₂ emissions (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." + consumption_emissions_per_gdp: + title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" + unit: "kilograms per international-$" + short_unit: "kg/$" + description: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$). Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). 
If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." + cumulative_consumption_emissions: + title: "Cumulative CO₂ consumption-based emissions" + unit: "tonnes" + short_unit: "t" + description: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of data availability, measured in tonnes. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. Consumption-based CO₂ emissions have been converted by Our World in Data from tonnes of carbon to tonnes of CO₂ using a conversion factor of 3.664." + cumulative_consumption_emissions_as_share_of_global: + title: "Share of global cumulative CO₂ consumption-based emissions" + unit: "%" + short_unit: "%" + description: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of data availability, measured as a percentage of global cumulative consumption-based emissions of CO₂ since the first year of data availability. Consumption-based emissions are national or regional emissions which have been adjusted for trade (i.e. territorial/production emissions minus emissions embedded in exports, plus emissions embedded in imports). If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide." 
+ cumulative_emissions_from_cement: + title: "Cumulative CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from cement since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + cumulative_emissions_from_cement_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from cement" + unit: "%" + short_unit: "%" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from cement since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from cement since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from cement has been calculated by Our World in Data using global CO₂ emissions from cement provided in the Global Carbon Budget dataset." + cumulative_emissions_from_coal: + title: "Cumulative CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from coal since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + cumulative_emissions_from_coal_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from coal" + unit: "%" + short_unit: "%" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from coal since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from coal since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. 
Each country's share of global CO₂ emissions from coal has been calculated by Our World in Data using global CO₂ emissions from coal provided in the Global Carbon Budget dataset." + cumulative_emissions_from_flaring: + title: "Cumulative CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from flaring since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + cumulative_emissions_from_flaring_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from flaring" + unit: "%" + short_unit: "%" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from flaring since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from flaring since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from flaring has been calculated by Our World in Data using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset." + cumulative_emissions_from_gas: + title: "Cumulative CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from gas since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
+ cumulative_emissions_from_gas_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from gas" + unit: "%" + short_unit: "%" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from gas since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from gas since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from gas has been calculated by Our World in Data using global CO₂ emissions from gas provided in the Global Carbon Budget dataset." + cumulative_emissions_from_land_use_change: + title: "Cumulative CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from land-use change since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + cumulative_emissions_from_land_use_change_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from land-use change" + unit: "%" + short_unit: "%" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from land-use change since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from land-use change since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset."
+ cumulative_emissions_from_oil: + title: "Cumulative CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from oil since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + cumulative_emissions_from_oil_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from oil" + unit: "%" + short_unit: "%" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from oil since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from oil since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from oil has been calculated by Our World in Data using global CO₂ emissions from oil provided in the Global Carbon Budget dataset. Global oil emissions include all country emissions as well as emissions from international aviation and shipping." + cumulative_emissions_from_other_industry: + title: "Cumulative CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from other industry sources since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
+ cumulative_emissions_from_other_industry_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from other industry" + unit: "%" + short_unit: "%" + description: "Cumulative production-based emissions of carbon dioxide (CO₂) from other industry sources since the first year of data availability, measured as a percentage of global cumulative production-based emissions of CO₂ from other industry sources since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from other industry sources has been calculated by Our World in Data using global CO₂ emissions from other industry sources provided in the Global Carbon Budget dataset. Global emissions from other industry sources include all country emissions." + cumulative_emissions_total: + title: "Cumulative CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description: "Total cumulative production-based emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + cumulative_emissions_total_as_share_of_global: + title: "Share of global cumulative CO₂ emissions" + unit: "%" + short_unit: "%" + description: "Total cumulative production-based emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of data availability, measured as a percentage of global total cumulative production-based emissions of CO₂ since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. 
Global emissions include all country emissions as well as emissions from international aviation and shipping." + cumulative_emissions_total_including_land_use_change: + title: "Cumulative CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description: "Total cumulative production-based emissions of carbon dioxide (CO₂), including land-use change, since the first year of data availability, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + cumulative_emissions_total_including_land_use_change_as_share_of_global: + title: "Share of global cumulative CO₂ emissions including land-use change" + unit: "%" + short_unit: "%" + description: "Total cumulative production-based emissions of carbon dioxide (CO₂), including land-use change, since the first year of data availability, measured as a percentage of global total cumulative production-based emissions of CO₂ (including land-use change) since the first year of data availability. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." + emissions_from_cement: + title: "Annual CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
+ emissions_from_cement_as_share_of_global: + title: "Share of global annual CO₂ emissions from cement" + unit: "%" + short_unit: "%" + description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global production-based emissions of CO₂ from cement in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from cement has been calculated by Our World in Data using global CO₂ emissions from cement provided in the Global Carbon Budget dataset." + emissions_from_cement_per_capita: + title: "Annual CO₂ emissions from cement (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_coal: + title: "Annual CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_coal_as_share_of_global: + title: "Share of global annual CO₂ emissions from coal" + unit: "%" + short_unit: "%" + description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global production-based emissions of CO₂ from coal in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from coal has been calculated by Our World in Data using global CO₂ emissions from coal provided in the Global Carbon Budget dataset." 
+ emissions_from_coal_per_capita: + title: "Annual CO₂ emissions from coal (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_flaring: + title: "Annual CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_flaring_as_share_of_global: + title: "Share of global annual CO₂ emissions from flaring" + unit: "%" + short_unit: "%" + description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global production-based emissions of CO₂ from flaring in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from flaring has been calculated by Our World in Data using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset." + emissions_from_flaring_per_capita: + title: "Annual CO₂ emissions from flaring (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_gas: + title: "Annual CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
+ emissions_from_gas_as_share_of_global: + title: "Share of global annual CO₂ emissions from gas" + unit: "%" + short_unit: "%" + description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global production-based emissions of CO₂ from gas in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from gas has been calculated by Our World in Data using global CO₂ emissions from gas provided in the Global Carbon Budget dataset. Global gas emissions include all country emissions as well as emissions from international aviation and shipping." + emissions_from_gas_per_capita: + title: "Annual CO₂ emissions from gas (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_land_use_change: + title: "Annual CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_land_use_change_as_share_of_global: + title: "Share of global annual CO₂ emissions from land-use change" + unit: "%" + short_unit: "%" + description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global production-based emissions of CO₂ from land-use change in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. 
Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset." + emissions_from_land_use_change_per_capita: + title: "Annual CO₂ emissions from land-use change per capita" + unit: "tonnes of CO₂ per capita" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_oil: + title: "Annual CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_oil_as_share_of_global: + title: "Share of global annual CO₂ emissions from oil" + unit: "%" + short_unit: "%" + description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global production-based emissions of CO₂ from oil in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from oil has been calculated by Our World in Data using global CO₂ emissions from oil provided in the Global Carbon Budget dataset. Global oil emissions include all country emissions as well as emissions from international aviation and shipping." + emissions_from_oil_per_capita: + title: "Annual CO₂ emissions from oil (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
+ emissions_from_other_industry: + title: "Annual CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_from_other_industry_as_share_of_global: + title: "Share of global annual CO₂ emissions from other industry" + unit: "%" + short_unit: "%" + description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global production-based emissions of CO₂ from other industry sources in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions from other industry sources has been calculated by Our World in Data using global CO₂ emissions from other industry sources provided in the Global Carbon Budget dataset. Global emissions from other industry sources include all country emissions." + emissions_from_other_industry_per_capita: + title: "Annual CO₂ emissions from other industry (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_total: + title: "Annual CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods."
+ emissions_total_as_share_of_global: + title: "Share of global annual CO₂ emissions" + unit: "%" + short_unit: "%" + description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global production-based emissions of CO₂ in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." + emissions_total_including_land_use_change: + title: "Annual CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_total_including_land_use_change_as_share_of_global: + title: "Share of global annual CO₂ emissions including land-use change" + unit: "%" + short_unit: "%" + description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total production-based emissions of CO₂ in the same year. This is based on territorial emissions, which do not account for emissions embedded in traded goods. Each country's share of global CO₂ emissions has been calculated by Our World in Data using global CO₂ emissions provided in the Global Carbon Budget dataset. Global emissions include all country emissions as well as emissions from international aviation and shipping." 
+ emissions_total_including_land_use_change_per_capita: + title: "Annual CO₂ emissions including land-use change per capita" + unit: "tonnes of CO₂ per capita" + short_unit: "t" + description: "Annual production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_total_including_land_use_change_per_gdp: + title: "Annual CO₂ emissions including land-use change per GDP" + unit: "kilograms per international-$" + short_unit: "kg/$" + description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$). Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_total_including_land_use_change_per_unit_energy: + title: "Annual CO₂ emissions including land-use change per unit energy" + unit: "kilograms per kilowatt-hour" + short_unit: "kg/kWh" + description: "Annual total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption. Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_total_per_capita: + title: "Annual CO₂ emissions (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person. This is based on territorial emissions, which do not account for emissions embedded in traded goods." 
+ emissions_total_per_gdp: + title: "Annual CO₂ emissions per GDP (kg per international-$)" + unit: "kilograms per international-$" + short_unit: "kg/$" + description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$). Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." + emissions_total_per_unit_energy: + title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)" + unit: "kilograms per kilowatt-hour" + short_unit: "kg/kWh" + description: "Annual total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption. Production-based emissions are based on territorial emissions, which do not account for emissions embedded in traded goods." + gdp: + title: "GDP" + unit: "2011 international-$" + short_unit: "$" + description: "Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) and price differences between countries." 
+ global_cumulative_emissions_from_cement: + title: "Global cumulative CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description: "" + global_cumulative_emissions_from_coal: + title: "Global cumulative CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description: "" + global_cumulative_emissions_from_flaring: + title: "Global cumulative CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description: "" + global_cumulative_emissions_from_gas: + title: "Global cumulative CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description: "" + global_cumulative_emissions_from_land_use_change: + title: "Global cumulative CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description: "" + global_cumulative_emissions_from_oil: + title: "Global cumulative CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description: "" + global_cumulative_emissions_from_other_industry: + title: "Global cumulative CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description: "" + global_cumulative_emissions_total: + title: "Global cumulative CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description: "" + global_cumulative_emissions_total_including_land_use_change: + title: "Global cumulative CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_from_cement: + title: "Global annual CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_from_coal: + title: "Global annual CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_from_flaring: + title: "Global annual CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_from_gas: + title: "Global annual CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_from_international_transport: + title: "Global 
annual CO₂ emissions from international transport" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_from_land_use_change: + title: "Global annual CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_from_oil: + title: "Global annual CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_from_other_industry: + title: "Global annual CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_total: + title: "Global annual CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description: "" + global_emissions_total_including_land_use_change: + title: "Global annual CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description: "" + global_population: + title: "Global population" + unit: "persons" + short_unit: "persons" + description: "World population." + growth_emissions_total: + title: "Annual CO₂ emissions growth (abs)" + unit: "tonnes" + short_unit: "t" + description: "Annual growth in total production-based emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + growth_emissions_total_including_land_use_change: + title: "Growth rate of emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description: "Annual growth in total production-based emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + land_use_change_quality_flag: + title: "Land-use change quality flag" + unit: "" + short_unit: "" + description: "Carbon dioxide emissions from land use change vary significantly in their degree of certainty. 
The quality flag is 1 if the different estimates of land-use change emissions considered by the Global Carbon Project have a reasonable agreement. Otherwise the quality flag is 0. The flag is also set to zero if not all estimates have data for a given country. For a more detailed definition, see the original paper." + pct_growth_emissions_total: + title: "Annual CO₂ emissions growth (%)" + unit: "%" + short_unit: "%" + description: "Annual percentage growth in total production-based emissions of carbon dioxide (CO₂), excluding land-use change. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + pct_growth_emissions_total_including_land_use_change: + title: "Growth rate of emissions including land-use change (%)" + unit: "%" + short_unit: "%" + description: "Annual percentage growth in total production-based emissions of carbon dioxide (CO₂), including land-use change. This is based on territorial emissions, which do not account for emissions embedded in traded goods." + pct_traded_emissions: + title: "Share of annual CO₂ emissions embedded in trade" + unit: "%" + short_unit: "%" + description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of production-based emissions of CO₂. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." + pct_traded_emissions_including_land_use_change: + title: "Traded emissions including land-use change (%)" + unit: "%" + short_unit: "%" + description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured as a percentage of production-based emissions of CO₂, including land-use change. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy.
A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." + population: + title: "Population" + unit: "persons" + short_unit: "persons" + description: "" + population_as_share_of_global: + title: "Share of population" + unit: "%" + short_unit: "%" + description: "Population, measured as a percentage of global total population in the same year." + primary_energy_consumption: + title: "Primary energy consumption" + unit: "terawatt-hours" + short_unit: "TWh" + description: "Primary energy consumption, measured in terawatt-hours per year." + traded_emissions: + title: "Annual CO₂ emissions embedded in trade" + unit: "tonnes" + short_unit: "t" + description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." + traded_emissions_including_land_use_change: + title: "Traded emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured in tonnes. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." + traded_emissions_including_land_use_change_per_capita: + title: "Traded emissions including land-use change per capita" + unit: "tonnes of CO₂ per capita" + short_unit: "t" + description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, including land-use change, measured in tonnes per person. 
Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." + traded_emissions_per_capita: + title: "Annual CO₂ emissions embedded in trade (per capita)" + unit: "tonnes per capita" + short_unit: "t" + description: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person. Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter." diff --git a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.py new file mode 100644 index 00000000000..f6fb09796e3 --- /dev/null +++ b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.py @@ -0,0 +1,945 @@ +"""This step creates the Global Carbon Budget (GCB) dataset, by the Global Carbon Project (GCP). + +It harmonizes and further processes meadow data, and uses the following auxiliary datasets: +- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP. +- Primary Energy Consumption (mix of sources from the 'energy' namespace) to calculate emissions per unit energy. +- Population (mix of sources), to calculate emissions per capita. +- Regions (mix of sources), to generate aggregates for different continents. +- WorldBank's Income groups, to generate aggregates for different income groups. + +""" + +import numpy as np +import pandas as pd +from owid.catalog import Dataset, Table +from owid.datautils import dataframes +from structlog import get_logger + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +log = get_logger() + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + +# Expected outliers in consumption-based emissions (with negative emissions in the original data, that will be removed). +OUTLIERS_IN_CONSUMPTION_DF = [ + ("Panama", 2003), + ("Panama", 2004), + ("Panama", 2005), + ("Panama", 2006), + ("Panama", 2011), + ("Panama", 2012), + ("Panama", 2013), + ("Venezuela", 2018), +] + +# Label used for international transport (emissions from oil in bunker fuels), included as a country in the +# fossil CO2 emissions dataset. +INTERNATIONAL_TRANSPORT_LABEL = "International Transport" + +# Regions and income groups to create by aggregating contributions from member countries. +# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. +# Otherwise, the dictionary can have "additional_regions", "excluded_regions", "additional_members", and +# "excluded_members". The aggregates will be calculated on the resulting countries. +REGIONS = { + # Default continents. + "Africa": {}, + "Asia": {}, + "Europe": {}, + "European Union (27)": {}, + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, + # Additional composite regions. + "Asia (excl. China and India)": { + "additional_regions": ["Asia"], + "excluded_members": ["China", "India"], + }, + "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, + "Europe (excl. EU-28)": { + "additional_regions": ["Europe"], + "excluded_regions": ["European Union (27)"], + "excluded_members": ["United Kingdom"], + }, + "European Union (28)": { + "additional_regions": ["European Union (27)"], + "additional_members": ["United Kingdom"], + }, + "North America (excl.
USA)": { + "additional_regions": ["North America"], + "excluded_members": ["United States"], + }, +} + +# Columns to use from GCB fossil CO2 emissions data and how to rename them. +CO2_COLUMNS = { + "country": "country", + "year": "year", + "cement": "emissions_from_cement", + "coal": "emissions_from_coal", + "flaring": "emissions_from_flaring", + "gas": "emissions_from_gas", + "oil": "emissions_from_oil", + "other": "emissions_from_other_industry", + "total": "emissions_total", +} + +# List all sources of emissions considered. +EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]] + +# Columns to use from primary energy consumption data and how to rename them. +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", +} + +# Columns to use from GDP data and how to rename them. +GDP_COLUMNS = { + "country": "country", + "year": "year", + "gdp": "gdp", +} + +# Columns to use from historical emissions data and how to rename them. +HISTORICAL_EMISSIONS_COLUMNS = { + "country": "country", + "year": "year", + # Global fossil emissions are used only for sanity checks. + "global_fossil_emissions": "global_fossil_emissions", + "global_land_use_change_emissions": "global_emissions_from_land_use_change", +} + +# Columns to use from consumption-based emissions data and how to rename them. +CONSUMPTION_EMISSIONS_COLUMNS = { + "country": "country", + "year": "year", + "consumption_emissions": "consumption_emissions", +} + +# Conversion from terawatt-hours to kilowatt-hours. +TWH_TO_KWH = 1e9 + +# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. +BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 + +# Conversion factor to change from million tonnes of carbon to tonnes of CO2. +MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 + +# Conversion from million tonnes of CO2 to tonnes of CO2.
+MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 + +# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy). +TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 + +# In order to remove uninformative columns, keep only rows where at least one of the following columns has data. +# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. +COLUMNS_THAT_MUST_HAVE_DATA = [ + "emissions_from_cement", + "emissions_from_coal", + "emissions_from_flaring", + "emissions_from_gas", + "emissions_from_oil", + "emissions_from_other_industry", + "emissions_total", + "consumption_emissions", + "emissions_from_land_use_change", + # 'land_use_change_quality_flag', +] + + +def sanity_checks_on_input_data( + df_production: pd.DataFrame, df_consumption: pd.DataFrame, df_historical: pd.DataFrame, df_co2: pd.DataFrame +) -> None: + """Run sanity checks on input data files. + + These checks should be used prior to country harmonization, but after basic processing of the dataframes. + + Parameters + ---------- + df_production : pd.DataFrame + Production-based emissions from GCP's official national emissions dataset (excel file). + df_consumption : pd.DataFrame + Consumption-based emissions from GCP's official national emissions dataset (excel file). + df_historical : pd.DataFrame + Historical emissions from GCP's official global emissions dataset (excel file). + df_co2 : pd.DataFrame + Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). + + """ + df_production = df_production.copy() + df_consumption = df_consumption.copy() + df_historical = df_historical.copy() + df_co2 = df_co2.copy() + + # In the original data, Bunkers was included in the national data file, as another country. + # But I suppose it should be considered as another kind of global emission. + # In fact, bunker emissions should coincide for production and consumption emissions. 
+ global_bunkers_emissions = ( + df_production[df_production["country"] == "Bunkers"][["year", "production_emissions"]] + .reset_index(drop=True) + .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise") + ) + + # Check that we get exactly the same array of bunker emissions from the consumption emissions dataframe + # (on years where there is data for bunker emissions in both datasets). + comparison = pd.merge( + global_bunkers_emissions, + df_consumption[df_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]] + .reset_index(drop=True) + .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"), + how="inner", + on="year", + suffixes=("", "_check"), + ) + + error = "Bunker emissions were expected to coincide in production and consumption emissions dataframes." + assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error + + # Check that all production-based emissions are positive. + error = "There are negative emissions in df_production (from the additional variables dataset)." + assert (df_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + + # Check that all production-based emissions from the fossil CO2 dataset are positive. + error = "There are negative emissions in df_co2 (from the fossil CO2 dataset)." + assert (df_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + + # Check that all consumption-based emissions are positive. + error = "There are negative emissions in df_consumption (from the national emissions dataset)." + assert (df_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + + # Check that, for the World, production emissions coincides with consumption emissions (on common years). + error = "Production and consumption emissions for the world were expected to be identical." 
+ comparison = pd.merge( + df_production[df_production["country"] == "World"].reset_index(drop=True), + df_consumption[df_consumption["country"] == "World"].reset_index(drop=True), + how="inner", + on="year", + ) + assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error + + # Check that production emissions for the World coincide with global (historical) emissions (on common years). + comparison = pd.merge( + df_production[df_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), + df_historical[["year", "global_fossil_emissions"]], + how="inner", + on="year", + ) + error = "Production emissions for the world were expected to coincide with global fossil emissions." + assert ( + abs(comparison["production_emissions"] - comparison["global_fossil_emissions"]) + / (comparison["global_fossil_emissions"]) + < 0.001 + ).all(), error + + # Check that emissions in df_production (emissions from the national excel file) coincide with emissions in df_co2 + # (from the Fossil CO2 emissions csv file). + # Given that country names have not yet been harmonized, rename the only countries that are present in both datasets. + comparison = pd.merge( + df_co2[["country", "year", "emissions_total"]], + df_production.replace({"Bunkers": "International Transport", "World": "Global"}), + on=["country", "year"], + how="inner", + ).dropna(subset=["emissions_total", "production_emissions"], how="any") + # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in df_production), + # omit that row in the comparison. + comparison = comparison.drop( + comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index + ).reset_index(drop=True) + + error = "Production emissions from national file were expected to coincide with the Fossil CO2 emissions dataset." 
+ assert ( + ( + 100 + * abs(comparison["production_emissions"] - comparison["emissions_total"]) + / (comparison["emissions_total"]) + ).fillna(0) + < 0.1 + ).all(), error + + +def sanity_checks_on_output_data(combined_df: pd.DataFrame) -> None: + """Run sanity checks on output data. + + These checks should be run on the very final output dataframe (with an index) prior to storing it as a table. + + Parameters + ---------- + combined_df : pd.DataFrame + Combination of all input dataframes, after processing, harmonization, and addition of variables. + + """ + combined_df = combined_df.reset_index() + error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan." + positive_variables = [ + col + for col in combined_df.columns + if col != "country" + if "traded" not in col + if "growth" not in col + if "land_use" not in col + ] + assert (combined_df[positive_variables].fillna(0) >= 0).all().all(), error + + error = "Production emissions as a share of global emissions should be 100% for 'World' (within 2% error)." + assert combined_df[ + (combined_df["country"] == "World") & (abs(combined_df["emissions_total_as_share_of_global"] - 100) > 2) + ].empty, error + + error = "Consumption emissions as a share of global emissions should be 100% for 'World' (within 2% error)." + assert combined_df[ + (combined_df["country"] == "World") & (abs(combined_df["consumption_emissions_as_share_of_global"] - 100) > 2) + ].empty, error + + error = "Population as a share of global population should be 100% for 'World'." + assert combined_df[ + (combined_df["country"] == "World") & (combined_df["population_as_share_of_global"].fillna(100) != 100) + ].empty, error + + error = "All share of global emissions should be smaller than 100% (within 2% error)." 
+ share_variables = [col for col in combined_df.columns if "share" in col] + assert (combined_df[share_variables].fillna(0) <= 102).all().all(), error + + # Check that cumulative variables are monotonically increasing. + # Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global + # (since they are not necessarily monotonic) and land-use change (which can be negative). + cumulative_cols = [ + col for col in combined_df.columns if "cumulative" in col if "share" not in col if "land_use" not in col + ] + # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small. + # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have + # a percentage change that is smaller than, say, 0.1%. + error = ( + "Cumulative variables (not given as a share of global) should be monotonically increasing (except when " + "including land-use change emissions, which can be negative)." + ) + assert ( + combined_df.sort_values("year", ascending=False) + .groupby("country") + .agg({col: lambda x: ((x.pct_change().dropna() * 100) <= 0.1).all() for col in cumulative_cols}) + .all() + .all() + ), error + + error = ( + "Production emissions as a share of global production emissions for the World should always be 100% " + "(or larger than 98%, given small discrepancies)." + ) + # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%. + share_variables = [col for col in combined_df.columns if "share" in col if "consumption" not in col] + assert (combined_df[combined_df["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error + + error = "Traded emissions for the World should be close to zero (within 2% error)." 
+ world_mask = combined_df["country"] == "World" + assert ( + abs( + 100 + * combined_df[world_mask]["traded_emissions"].fillna(0) + / combined_df[world_mask]["emissions_total"].fillna(1) + ) + < 2 + ).all(), error + + +def prepare_fossil_co2_emissions(df_co2: pd.DataFrame) -> pd.DataFrame: + """Prepare Fossil CO2 emissions data (basic processing).""" + # Select and rename columns from fossil CO2 data. + df_co2 = df_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise") + + # Ensure all emissions are given in tonnes of CO2. + df_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 + + #################################################################################################################### + # NOTE: For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed + # for some countries (namely China and US). + # This causes the cumulative emissions from other industry as share of global for those countries to become larger + # than 100%. + # This temporary solution fixes the issue: We aggregate the data for China and US on those years when the world's + # data is missing (without touching other years or other columns). + # Firstly, list of years for which the world has no data for emissions_from_other_industry. + world_missing_years = ( + df_co2[(df_co2["country"] == "Global") & (df_co2["emissions_from_other_industry"].isnull())]["year"] + .unique() + .tolist() # type: ignore + ) + # Data that needs to be aggregated. + data_missing_in_world = df_co2[ + df_co2["year"].isin(world_missing_years) & (df_co2["emissions_from_other_industry"].notnull()) + ] + # Check that there is indeed data to be aggregated (that is missing for the World). + error = ( + "Expected emissions_from_other_industry to be null for the world but not null for certain countries " + "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified." 
+ ) + assert len(data_missing_in_world) > 0, error + # Create a dataframe of aggregate data for the World, on those years when it's missing. + aggregated_missing_data = ( + data_missing_in_world.groupby("year") + .agg({"emissions_from_other_industry": "sum"}) + .reset_index() + .assign(**{"country": "Global"}) + ) + # Combine the new dataframe of aggregate data with the main dataframe. + df_co2 = dataframes.combine_two_overlapping_dataframes( + df1=df_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True + ) + #################################################################################################################### + + # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the + # emissions of Kuwait. This ensures that they will be included in region aggregates. + error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991." + assert df_co2[ + (df_co2["country"] == "Kuwaiti Oil Fires") + & (df_co2["emissions_total"].notnull()) + & (df_co2["emissions_total"] != 0) + ]["year"].tolist() == [1991], error + + df_co2.loc[(df_co2["country"] == "Kuwait") & (df_co2["year"] == 1991), EMISSION_SOURCES] = ( + df_co2[(df_co2["country"] == "Kuwaiti Oil Fires") & (df_co2["year"] == 1991)][EMISSION_SOURCES].values + + df_co2[(df_co2["country"] == "Kuwait") & (df_co2["year"] == 1991)][EMISSION_SOURCES].values + ) + + # Check that "emissions_total" agrees with the sum of emissions from individual sources. + error = "The sum of all emissions should add up to total emissions (within 1%)." + assert ( + abs( + df_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1) + - df_co2["emissions_total"].fillna(0) + ) + / (df_co2["emissions_total"].fillna(0) + 1e-7) + < 1e-2 + ).all(), error + + # Many rows have zero total emissions, but actually the individual sources are nan. + # Total emissions in those cases should be nan, instead of zero. 
+ no_individual_emissions = df_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1) + df_co2.loc[no_individual_emissions, "emissions_total"] = np.nan + + return df_co2 + + +def prepare_consumption_emissions(df_consumption: pd.DataFrame) -> pd.DataFrame: + """Prepare consumption-based emissions data (basic processing).""" + # Select and rename columns. + df_consumption = df_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename( + columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise" + ) + + # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. + for column in df_consumption.drop(columns=["country", "year"]).columns: + df_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + # List indexes of rows in df_consumption corresponding to outliers (defined above in OUTLIERS_IN_CONSUMPTION_DF). + outlier_indexes = [ + df_consumption[(df_consumption["country"] == outlier[0]) & (df_consumption["year"] == outlier[1])].index.item() + for outlier in OUTLIERS_IN_CONSUMPTION_DF + ] + + error = ( + "Outliers were expected to have negative consumption emissions. " + "Maybe outliers have been fixed (and should be removed from the code)." + ) + assert (df_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error + + # Remove outliers. + df_consumption = df_consumption.drop(outlier_indexes).reset_index(drop=True) + + return df_consumption + + +def prepare_production_emissions(df_production: pd.DataFrame) -> pd.DataFrame: + """Prepare production-based emissions data (basic processing).""" + # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year.
+ for column in df_production.drop(columns=["country", "year"]).columns: + df_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + return df_production + + +def prepare_land_use_emissions(df_land_use: pd.DataFrame) -> pd.DataFrame: + """Prepare land-use change emissions data (basic processing).""" + # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. + df_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + return df_land_use + + +def prepare_historical_emissions(df_historical: pd.DataFrame) -> pd.DataFrame: + """Prepare historical emissions data.""" + # Select and rename columns from historical emissions data. + df_historical = df_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( + columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise" + ) + + # Convert units from gigatonnes of carbon per year emissions to tonnes of CO2 per year. + for column in df_historical.drop(columns=["country", "year"]).columns: + df_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + + return df_historical + + +def extract_global_emissions(df_co2: pd.DataFrame, df_historical: pd.DataFrame) -> pd.DataFrame: + """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset. + + The resulting global emissions data includes bunker and land-use change emissions. + + NOTE: This function has to be used after selecting and renaming columns in df_co2, but before harmonizing country + names in df_co2 (so that "International Transport" is still listed as a country). + + Parameters + ---------- + df_co2 : pd.DataFrame + Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). + df_historical : pd.DataFrame + Historical emissions from GCP's official global emissions dataset (excel file). + + Returns + ------- + global_emissions : pd.DataFrame + World emissions. 
+ + """ + # For some reason, "International Transport" is included as another country, that only has emissions from oil. + # We separate it as another variable (only given at the global level). + global_transport = df_co2[df_co2["country"] == INTERNATIONAL_TRANSPORT_LABEL].reset_index(drop=True) + + # Check that total emissions for international transport coincide with oil emissions. + error = "Total emissions from international transport do not coincide with oil emissions." + assert all((global_transport["emissions_from_oil"] - global_transport["emissions_total"]).dropna() == 0), error + + # Therefore, we can keep only one column for international transport emissions. + global_transport = ( + global_transport[["year", "emissions_from_oil"]] + .dropna() + .rename(columns={"emissions_from_oil": "global_emissions_from_international_transport"}, errors="raise") + ) + + # Create a new dataframe of global emissions. + global_emissions = ( + df_co2[df_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES] + .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise") + .sort_values("year") + .reset_index(drop=True) + ) + + # Add bunker fuels to global emissions. + global_emissions = pd.merge(global_emissions, global_transport, on=["year"], how="outer") + + # Add historical land-use change emissions to dataframe of global emissions. + global_emissions = pd.merge( + global_emissions, df_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year" + ) + + # Add variable of total emissions including fossil fuels and land use change. + global_emissions["global_emissions_total_including_land_use_change"] = ( + global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"] + ) + + # Calculate global cumulative emissions. 
+ for column in EMISSION_SOURCES + ["emissions_from_land_use_change", "emissions_total_including_land_use_change"]: + global_emissions[f"global_cumulative_{column}"] = global_emissions[f"global_{column}"].cumsum() + + # Add a country column and add global population. + global_emissions["country"] = "World" + + # Add global population. + global_emissions = geo.add_population_to_dataframe(df=global_emissions, population_col="global_population") + + return global_emissions + + +def harmonize_country_names(df: pd.DataFrame) -> pd.DataFrame: + """Harmonize country names, and fix known issues with certain regions. + + Parameters + ---------- + df : pd.DataFrame + Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions + datasets). + + Returns + ------- + df : pd.DataFrame + Emissions data after harmonizing country names. + + """ + # Harmonize country names. + df = geo.harmonize_countries( + df=df, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + warn_on_missing_countries=True, + warn_on_unused_countries=False, + make_missing_countries_nan=False, + warn_on_unknown_excluded_countries=False, + ) + + # Check that there is only one data point for each country-year. + # In the fossil CO2 emissions data, after harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and + # therefore there are rows with different data for the same country-year. + # However, "Pacific Islands (Palau)" have data until 1991, and "Palau" has data from 1992 onwards. + # After removing empty rows, there should be no overlap. + columns_that_must_have_data = df.drop(columns=["country", "year"]).columns + check = df.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) + error = "After harmonizing country names, there is more than one data point for the same country-year." 
+ assert check[check.duplicated(subset=["country", "year"])].empty, error + + df = df.dropna(subset="country").reset_index(drop=True) + + return df + + +def combine_data_and_add_variables( + df_co2: pd.DataFrame, + df_production: pd.DataFrame, + df_consumption: pd.DataFrame, + df_global_emissions: pd.DataFrame, + df_land_use: pd.DataFrame, + df_gdp: pd.DataFrame, + df_energy: pd.DataFrame, + df_population: pd.DataFrame, + ds_regions: Dataset, + ds_income_groups: Dataset, +) -> Table: + """Combine all relevant data into one dataframe, add region aggregates, and add custom variables (e.g. emissions per + capita). + + Parameters + ---------- + df_co2 : pd.DataFrame + Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization. + df_production : pd.DataFrame + Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization. + df_consumption : pd.DataFrame + Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization. + df_global_emissions : pd.DataFrame + World emissions (including bunker and land-use change emissions). + df_land_use : pd.DataFrame + National land-use change emissions from GCP's official dataset (excel file), after harmonization. + df_gdp : pd.DataFrame + GDP data. + df_energy : pd.DataFrame + Primary energy data. + df_population : pd.DataFrame + Population data. + ds_regions : Dataset + Regions dataset. + ds_income_groups : Dataset + Income groups dataset. + + Returns + ------- + tb_combined : Table + Combined data, with all additional variables and with region aggregates. + + """ + # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions dataset. 
+ gcp_aggregates = sorted(set(df_production["country"]) - set(df_co2["country"])) + df_co2 = pd.concat( + [ + df_co2, + df_production[df_production["country"].isin(gcp_aggregates)] + .rename(columns={"production_emissions": "emissions_total"}) + .astype({"year": int}), + ], + ignore_index=True, + ).reset_index(drop=True) + + # Add consumption emissions to main dataframe (keep only the countries of the main dataframe). + # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to df_co2 + # (when merging with df_production), all countries from df_consumption should be included in df_co2. + error = "Some countries in df_consumption are not included in df_co2." + assert set(df_consumption["country"]) < set(df_co2["country"]), error + df_co2 = pd.merge(df_co2, df_consumption, on=["country", "year"], how="outer") + + # Add population to original dataframe. + df_co2 = pd.merge(df_co2, df_population[["country", "year", "population"]], on=["country", "year"], how="left") + + # Add GDP to main dataframe. + df_co2 = pd.merge(df_co2, df_gdp, on=["country", "year"], how="left") + + # Add primary energy to main dataframe. + df_co2 = pd.merge(df_co2, df_energy, on=["country", "year"], how="left") + + # For convenience, rename columns in land-use change emissions data. + df_land_use = df_land_use.rename( + columns={"emissions": "emissions_from_land_use_change", "quality_flag": "land_use_change_quality_flag"} + ) + + # Land-use change data does not include data for the World. Include it by merging with the global dataset. + df_land_use = pd.concat( + [ + df_land_use, + df_global_emissions.rename( + columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"} + )[["year", "emissions_from_land_use_change"]] + .dropna() + .assign(**{"country": "World"}), + ], + ignore_index=True, + ).astype({"year": int}) + + # Add land-use change emissions to main dataframe. 
+ df_co2 = pd.merge(df_co2, df_land_use, on=["country", "year"], how="outer") + + # Add total emissions (including land-use change) for each country. + df_co2["emissions_total_including_land_use_change"] = ( + df_co2["emissions_total"] + df_co2["emissions_from_land_use_change"] + ) + + # Add region aggregates. + # Aggregate not only emissions data, but also population, gdp and primary energy. + # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data. + aggregations = { + column: "sum" for column in df_co2.columns if column not in ["country", "year", "land_use_change_quality_flag"] + } + for region in REGIONS: + countries_in_region = geo.list_members_of_region( + region=region, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + additional_regions=REGIONS[region].get("additional_regions", None), + excluded_regions=REGIONS[region].get("excluded_regions", None), + additional_members=REGIONS[region].get("additional_members", None), + excluded_members=REGIONS[region].get("excluded_members", None), + ) + df_co2 = geo.add_region_aggregates( + df=df_co2, + region=region, + countries_in_region=countries_in_region, + countries_that_must_have_data=[], + frac_allowed_nans_per_year=0.999, + aggregations=aggregations, + ) + + # Add global emissions and global cumulative emissions columns to main dataframe. + df_co2 = pd.merge(df_co2, df_global_emissions.drop(columns="country"), on=["year"], how="left") + + # Ensure main dataframe is sorted (so that cumulative emissions are properly calculated). + df_co2 = df_co2.sort_values(["country", "year"]).reset_index(drop=True) + + # Temporarily add certain global emissions variables. + # This is done simply to be able to consider "consumption_emissions" as just another type of emission + # when creating additional variables. 
+ df_co2["global_consumption_emissions"] = df_co2["global_emissions_total"] + df_co2["global_cumulative_consumption_emissions"] = df_co2["global_cumulative_emissions_total"] + + # Add new variables for each source of emissions. + for column in EMISSION_SOURCES + [ + "consumption_emissions", + "emissions_from_land_use_change", + "emissions_total_including_land_use_change", + ]: + # Add per-capita variables. + df_co2[f"{column}_per_capita"] = df_co2[column] / df_co2["population"] + + # Add columns for cumulative emissions. + # Rows that had nan emissions will have nan cumulative emissions. + # But nans will not be propagated in the sum. + # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions + # (treating nans as zeros), but will have nan on those rows that were not informed. + df_co2[f"cumulative_{column}"] = df_co2.groupby(["country"])[column].cumsum() + + # Add share of global emissions. + df_co2[f"{column}_as_share_of_global"] = 100 * df_co2[column] / df_co2[f"global_{column}"] + + # Add share of global cumulative emissions. + df_co2[f"cumulative_{column}_as_share_of_global"] = ( + 100 * df_co2[f"cumulative_{column}"] / df_co2[f"global_cumulative_{column}"] + ) + + # Add total emissions per unit energy (in kg of emissions per kWh). + df_co2["emissions_total_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total"] / (df_co2["primary_energy_consumption"] * TWH_TO_KWH) + ) + + # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). + df_co2["emissions_total_including_land_use_change_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * df_co2["emissions_total_including_land_use_change"] + / (df_co2["primary_energy_consumption"] * TWH_TO_KWH) + ) + + # Add total emissions per unit GDP. 
+ df_co2["emissions_total_per_gdp"] = TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total"] / df_co2["gdp"] + + # Add total emissions (including land-use change) per unit GDP. + df_co2["emissions_total_including_land_use_change_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total_including_land_use_change"] / df_co2["gdp"] + ) + + # Add total consumption emissions per unit GDP. + df_co2["consumption_emissions_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["consumption_emissions"] / df_co2["gdp"] + ) + + # Add variable of emissions embedded in trade. + df_co2["traded_emissions"] = df_co2["consumption_emissions"] - df_co2["emissions_total"] + df_co2["pct_traded_emissions"] = 100 * df_co2["traded_emissions"] / df_co2["emissions_total"] + df_co2["traded_emissions_per_capita"] = df_co2["traded_emissions"] / df_co2["population"] + + # Add variable of emissions embedded in trade, including land-use change emissions. + df_co2["traded_emissions_including_land_use_change"] = ( + df_co2["consumption_emissions"] - df_co2["emissions_total_including_land_use_change"] + ) + df_co2["pct_traded_emissions_including_land_use_change"] = ( + 100 * df_co2["traded_emissions_including_land_use_change"] / df_co2["emissions_total_including_land_use_change"] + ) + df_co2["traded_emissions_including_land_use_change_per_capita"] = ( + df_co2["traded_emissions_including_land_use_change"] / df_co2["population"] + ) + + # Remove temporary columns. + df_co2 = df_co2.drop(columns=["global_consumption_emissions", "global_cumulative_consumption_emissions"]) + + # Add annual percentage growth of total emissions. + df_co2["pct_growth_emissions_total"] = df_co2.groupby("country")["emissions_total"].pct_change() * 100 + + # Add annual percentage growth of total emissions (including land-use change). 
+ df_co2["pct_growth_emissions_total_including_land_use_change"] = ( + df_co2.groupby("country")["emissions_total_including_land_use_change"].pct_change() * 100 + ) + + # Add annual absolute growth of total emissions. + df_co2["growth_emissions_total"] = df_co2.groupby("country")["emissions_total"].diff() + + # Add annual absolute growth of total emissions (including land-use change). + df_co2["growth_emissions_total_including_land_use_change"] = df_co2.groupby("country")[ + "emissions_total_including_land_use_change" + ].diff() + + # Create variable of population as a share of global population. + df_co2["population_as_share_of_global"] = df_co2["population"] / df_co2["global_population"] * 100 + + # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. + for column in df_co2.drop(columns=["country", "year"]).columns: + df_co2.loc[np.isinf(df_co2[column]), column] = np.nan + + # For special GCP countries/regions (e.g. "Africa (GCP)") we should keep only the original data. + # Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. + added_variables = df_co2.drop(columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA).columns.tolist() + df_co2.loc[(df_co2["country"].str.contains(" (GCP)", regex=False)), added_variables] = np.nan + + # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). + df_co2 = df_co2.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index(drop=True) + + # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently. + df_co2 = df_co2.set_index(["country", "year"], verify_integrity=True) + df_co2 = df_co2.dropna(subset=df_co2.columns, how="all").sort_index().sort_index(axis=1) + + # Create a table with the generated data. 
+ tb_combined = Table(df_co2, short_name=paths.short_name, underscore=True) + + return tb_combined + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read all its tables. + ds_meadow: Dataset = paths.load_dependency("global_carbon_budget") + tb_co2 = ds_meadow["global_carbon_budget_fossil_co2_emissions"] + tb_historical = ds_meadow["global_carbon_budget_historical_budget"] + tb_consumption = ds_meadow["global_carbon_budget_consumption_emissions"] + tb_production = ds_meadow["global_carbon_budget_production_emissions"] + tb_land_use = ds_meadow["global_carbon_budget_land_use_change"] + + # Load primary energy consumption dataset and read its main table. + ds_energy: Dataset = paths.load_dependency("primary_energy_consumption") + tb_energy = ds_energy["primary_energy_consumption"] + + # Load GDP dataset and read its main table. + ds_gdp: Dataset = paths.load_dependency("ggdc_maddison") + tb_gdp = ds_gdp["maddison_gdp"] + + # Load population dataset and read its main table. + ds_population: Dataset = paths.load_dependency("population") + tb_population = ds_population["population"] + + # Load regions dataset and read its main tables (it will be used to create region aggregates). + ds_regions: Dataset = paths.load_dependency("regions") + + # Load income groups dataset and read its main table (it will be used to create region aggregates). + ds_income_groups: Dataset = paths.load_dependency("wb_income") + + # Create a dataframe for each table. + df_co2 = pd.DataFrame(tb_co2).reset_index() + df_historical = pd.DataFrame(tb_historical).reset_index() + df_consumption = pd.DataFrame(tb_consumption).reset_index() + df_production = pd.DataFrame(tb_production).reset_index() + df_land_use = pd.DataFrame(tb_land_use).reset_index() + df_energy = pd.DataFrame(tb_energy).reset_index() + df_gdp = pd.DataFrame(tb_gdp).reset_index() + df_population = pd.DataFrame(tb_population).reset_index() + + # + # Process data. 
+ # + # Prepare fossil CO2 emissions data. + df_co2 = prepare_fossil_co2_emissions(df_co2=df_co2) + + # Prepare consumption-based emission data. + df_consumption = prepare_consumption_emissions(df_consumption=df_consumption) + + # Prepare production-based emission data. + df_production = prepare_production_emissions(df_production=df_production) + + # Prepare land-use emission data. + df_land_use = prepare_land_use_emissions(df_land_use=df_land_use) + + # Select and rename columns from primary energy data. + df_energy = df_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise") + + # Select and rename columns from primary energy data. + df_gdp = df_gdp[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") + + # Prepare historical emissions data. + df_historical = prepare_historical_emissions(df_historical=df_historical) + + # Run sanity checks on input data. + sanity_checks_on_input_data( + df_production=df_production, df_consumption=df_consumption, df_historical=df_historical, df_co2=df_co2 + ) + + # For some reason, "International Transport" is included as another country, that only has emissions from oil. + # Extract that data and remove it from the rest of national emissions. + df_global_emissions = extract_global_emissions(df_co2=df_co2, df_historical=df_historical) + + # Harmonize country names. + df_co2 = harmonize_country_names(df=df_co2) + df_consumption = harmonize_country_names(df=df_consumption) + df_production = harmonize_country_names(df=df_production) + df_land_use = harmonize_country_names(df=df_land_use) + + # Add new variables to main dataframe (consumption-based emissions, emission intensity, per-capita emissions, etc.). 
+ tb_combined = combine_data_and_add_variables( + df_co2=df_co2, + df_production=df_production, + df_consumption=df_consumption, + df_global_emissions=df_global_emissions, + df_land_use=df_land_use, + df_gdp=df_gdp, + df_energy=df_energy, + df_population=df_population, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + ) + + # Run sanity checks on output data. + sanity_checks_on_output_data(tb_combined) + + # + # Save outputs. + # + # Create a new garden dataset and use metadata from meadow dataset. + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata) + + ds_garden.save() diff --git a/etl/steps/data/grapher/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/data/grapher/gcp/2023-07-10/global_carbon_budget.py new file mode 100644 index 00000000000..78d07f6e953 --- /dev/null +++ b/etl/steps/data/grapher/gcp/2023-07-10/global_carbon_budget.py @@ -0,0 +1,81 @@ +"""Load a garden dataset and create a grapher dataset. + +Some auxiliary variables will be added (where nans are filled with zeros, to avoid missing data in stacked area charts). + +""" +from copy import deepcopy + +import numpy as np +import pandas as pd +from owid.catalog import Dataset + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# For two stacked area charts (namely "CO₂ emissions by fuel type" and "Cumulative CO₂ emissions by source") having +# nans in the data causes the chart to show only years where all sources have data. +# To avoid this, create additional variables that have nans filled with zeros. 
+VARIABLES_TO_FILL_WITH_ZEROS = [ + "emissions_total", + "emissions_from_cement", + "emissions_from_coal", + "emissions_from_flaring", + "emissions_from_gas", + "emissions_from_land_use_change", + "emissions_from_oil", + "emissions_from_other_industry", + "cumulative_emissions_total", + "cumulative_emissions_from_cement", + "cumulative_emissions_from_coal", + "cumulative_emissions_from_flaring", + "cumulative_emissions_from_gas", + "cumulative_emissions_from_land_use_change", + "cumulative_emissions_from_oil", + "cumulative_emissions_from_other_industry", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden: Dataset = paths.load_dependency("global_carbon_budget") + + # Read table from garden dataset. + tb_garden = ds_garden["global_carbon_budget"] + + # + # Process data. + # + # Ensure all countries span all years (from 1750 to the latest observation), even if many of those rows are empty. + # This will increase the size of the dataset, but we do this so that stacked area charts span the maximum possible + # range of years. + countries = tb_garden.reset_index()["country"].unique() + years = np.arange(tb_garden.reset_index()["year"].min(), tb_garden.reset_index()["year"].max() + 1, dtype=int) + tb_garden = tb_garden.reindex(pd.MultiIndex.from_product([countries, years], names=["country", "year"])) + + # Create additional variables in the table that have nans filled with zeros (for two specific stacked area charts). 
+ for variable in VARIABLES_TO_FILL_WITH_ZEROS: + new_variable_name = variable + "_zero_filled" + tb_garden[new_variable_name] = tb_garden[variable].fillna(0) + tb_garden[new_variable_name].metadata = deepcopy(tb_garden[variable].metadata) + tb_garden[new_variable_name].metadata.title = tb_garden[variable].metadata.title + " (zero filled)" + tb_garden[new_variable_name].metadata.description = ( + tb_garden[variable].metadata.description + " Missing data has been filled with zeros for the purposes of " + "data visualization." + ) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # Sanity checks. + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/data/meadow/gcp/2023-07-10/global_carbon_budget.py new file mode 100644 index 00000000000..06d59474b01 --- /dev/null +++ b/etl/steps/data/meadow/gcp/2023-07-10/global_carbon_budget.py @@ -0,0 +1,238 @@ +"""Load a snapshot and create a meadow dataset. + +It combines the following snapshots: +- GCP's Fossil CO2 emissions (long-format csv). +- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. +- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. + - Production-based emissions from this file are also used, but just to include total emissions of regions + according to GCP (e.g. "Africa (GCP)") and for sanity checks. +- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. 
+ +""" + +import pandas as pd +from owid.catalog import Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset +from etl.snapshot import Snapshot + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def prepare_fossil_co2(df_fossil_co2: pd.DataFrame) -> Table: + # Set an appropriate index and sort conveniently. + df_fossil_co2 = df_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Create a new table and ensure all columns are snake-case. + tb_fossil_co2 = Table(df_fossil_co2, short_name="global_carbon_budget_fossil_co2_emissions", underscore=True) + + return tb_fossil_co2 + + +def prepare_historical_budget(df_historical_budget: pd.DataFrame) -> Table: + """Select variables and prepare the historical budget sheet of GCB's raw global data file. + + Parameters + ---------- + df_historical_budget : pd.DataFrame + Historical budget sheet of GCB's raw global data file. + + Returns + ------- + tb_historical_budget : Table + Historical budget after selecting variables and processing them. + + """ + # Sanity check. + error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." + assert df_historical_budget.columns[0] == "Year", error + + # Columns to select in historical budget and how to rename them. + columns = { + "Year": "year", + "fossil emissions excluding carbonation": "global_fossil_emissions", + "land-use change emissions": "global_land_use_change_emissions", + } + df_historical_budget = df_historical_budget[list(columns)].rename(columns=columns) + + # Add column for country (to be able to combine this with the national data). + df_historical_budget["country"] = "World" + + # Set an index and sort row and columns conveniently. 
+ df_historical_budget = ( + df_historical_budget.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + ) + + # Create a table with the generated data. + tb_historical_budget = Table( + df_historical_budget, short_name="global_carbon_budget_historical_budget", underscore=True + ) + + return tb_historical_budget + + +def prepare_land_use_emissions(df_land_use: pd.DataFrame) -> Table: + """Prepare data from a specific sheet of the land-use change data file. + + Parameters + ---------- + df_land_use : pd.DataFrame + Data from a specific sheet of the land-use change emissions data file. + + Returns + ------- + tb_land_use : Table + Processed land-use change emissions data. + + """ + df_land_use = df_land_use.copy() + + # Sanity check. + error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." + assert df_land_use.columns[1] == "Afghanistan", error + + # Extract quality flag from the zeroth row of the data. + # Ignore nans (which happen when a certain country has no data). + quality_flag = ( + df_land_use.drop(columns=df_land_use.columns[0]) + .loc[0] + .dropna() + .astype(int) + .to_frame("quality_flag") + .reset_index() + .rename(columns={"index": "country"}) + ) + + # Drop the first row, which is for quality factor (which we have already extracted). + df_land_use = df_land_use.rename(columns={df_land_use.columns[0]: "year"}).drop(0) + + # Ignore countries that have no data. + df_land_use = df_land_use.dropna(axis=1, how="all") + + # Restructure data to have a column for country and another for emissions. + df_land_use = df_land_use.melt(id_vars="year", var_name="country", value_name="emissions") + + error = "Countries with emissions data differ from countries with quality flag." + assert set(df_land_use["country"]) == set(quality_flag["country"]), error + + # Add quality factor as an additional column. 
+ df_land_use = pd.merge(df_land_use, quality_flag, how="left", on="country") + + # Set an index and sort row and columns conveniently. + df_land_use = df_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Create a table with the generated data. + tb_land_use = Table(df_land_use, short_name="global_carbon_budget_land_use_change", underscore=True) + + return tb_land_use + + +def prepare_national_emissions(df: pd.DataFrame, column_name: str) -> Table: + """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national + data file. + + Parameters + ---------- + df : pd.DataFrame + Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. + column_name : str + Name to assign to emissions column to be generated. + + Returns + ------- + tb_national : Table + Processed territorial (or consumption) emissions sheet of GCB's raw national data file. + + """ + df = df.copy() + + error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." + assert df.columns[1] == "Afghanistan", error + + # The zeroth column is expected to be year. + df = df.rename(columns={df.columns[0]: "year"}) + + # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". + # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). + # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be + # handled at the garden step. + + # Remove unnecessary column. + df = df.drop(columns=["Statistical Difference"]) + + # Convert from wide to long format dataframe. + df = df.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) + + # Set an index and sort row and columns conveniently. 
+ df = df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Create a table with the generated data. + tb_national = Table(df, short_name=f"global_carbon_budget_{column_name}", underscore=True) + + return tb_national + + +def run(dest_dir: str) -> None: + log.info("global_carbon_budget.start") + + # + # Load inputs. + # + # Retrieve snapshots. + snap_fossil_co2: Snapshot = paths.load_dependency("global_carbon_budget_fossil_co2_emissions.csv") + snap_global: Snapshot = paths.load_dependency("global_carbon_budget_global_emissions.xlsx") + snap_national: Snapshot = paths.load_dependency("global_carbon_budget_national_emissions.xlsx") + snap_land_use: Snapshot = paths.load_dependency("global_carbon_budget_land_use_change_emissions.xlsx") + + # Load data from fossil CO2 emissions. + df_fossil_co2 = pd.read_csv(snap_fossil_co2.path) + + # Load historical budget from the global emissions file. + df_historical = pd.read_excel(snap_global.path, sheet_name="Historical Budget", skiprows=15) + + # Load land-use emissions. + df_land_use = pd.read_excel(snap_land_use.path, sheet_name="BLUE", skiprows=7) + + # Load production-based national emissions. + df_production = pd.read_excel(snap_national.path, sheet_name="Territorial Emissions", skiprows=11) + + # Load consumption-based national emissions. + df_consumption = pd.read_excel(snap_national.path, sheet_name="Consumption Emissions", skiprows=8) + + # + # Process data. + # + # Prepare data for fossil CO2 emissions. + tb_fossil_co2 = prepare_fossil_co2(df_fossil_co2=df_fossil_co2) + + # Prepare data for historical emissions. + tb_historical = prepare_historical_budget(df_historical_budget=df_historical) + + # Prepare data for land-use emissions. + tb_land_use = prepare_land_use_emissions(df_land_use=df_land_use) + + # Prepare data for production-based emissions, from the file of national emissions. 
+ tb_production = prepare_national_emissions(df=df_production, column_name="production_emissions") + + # Prepare data for consumption-based emissions, from the file of national emissions. + tb_consumption = prepare_national_emissions(df=df_consumption, column_name="consumption_emissions") + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset( + dest_dir, + tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], + default_metadata=snap_fossil_co2.metadata, + ) + + # Save changes in the new garden dataset. + ds_meadow.save() + + log.info("global_carbon_budget.end") From 36501b635a428c3d58e8a5a11c573afb3fa55062 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 12:14:54 +0200 Subject: [PATCH 03/16] feat(gcp): Update GCP dataset to use latest primary energy consumption --- dag/emissions.yml | 74 ++- .../2023-07-10/global_carbon_budget.meta.yml | 11 +- .../gcp/2023-07-10/global_carbon_budget.py | 573 ++++++++++-------- .../gcp/2023-07-10/global_carbon_budget.py | 42 +- .../gcp/2023-07-10/global_carbon_budget.py | 120 ++-- ...carbon_budget_fossil_co2_emissions.csv.dvc | 3 +- ...al_carbon_budget_global_emissions.xlsx.dvc | 2 +- ..._budget_land_use_change_emissions.xlsx.dvc | 2 +- ..._carbon_budget_national_emissions.xlsx.dvc | 2 +- 9 files changed, 430 insertions(+), 399 deletions(-) diff --git a/dag/emissions.yml b/dag/emissions.yml index 8ad1a96ad2b..e59e2f17a06 100644 --- a/dag/emissions.yml +++ b/dag/emissions.yml @@ -17,17 +17,6 @@ steps: data://grapher/cait/2022-08-10/n2o_emissions: - data://garden/cait/2022-08-10/ghg_emissions_by_sector # - # Emissions - CO2 dataset (2022). 
- # - data://garden/emissions/2023-05-03/owid_co2: - - data://garden/emissions/2023-05-02/national_contributions - - data://garden/gcp/2023-04-28/global_carbon_budget - - data://garden/cait/2022-08-10/ghg_emissions_by_sector - - data://garden/energy/2023-06-01/primary_energy_consumption - - data://garden/demography/2022-12-08/population - - data://garden/ggdc/2020-10-01/ggdc_maddison - - data://garden/regions/2023-01-01/regions - # # Andrew - CO2 mitigation curves (2019). # data://meadow/andrew/2019-12-03/co2_mitigation_curves: @@ -40,26 +29,37 @@ steps: data://grapher/andrew/2019-12-03/co2_mitigation_curves_2celsius: - data://garden/andrew/2019-12-03/co2_mitigation_curves # - # GCP - Global Carbon Budget (2023-04-28). + # GCP - Global Carbon Budget (2023-07-10). # - data://meadow/gcp/2023-04-28/global_carbon_budget: + data://meadow/gcp/2023-07-10/global_carbon_budget: - snapshot://gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv - snapshot://gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx - snapshot://gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx - snapshot://gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx - data://garden/gcp/2023-04-28/global_carbon_budget: - - data://meadow/gcp/2023-04-28/global_carbon_budget + data://garden/gcp/2023-07-10/global_carbon_budget: + - data://meadow/gcp/2023-07-10/global_carbon_budget # Loaded to calculate emissions per unit energy. - - data://garden/energy/2023-06-01/primary_energy_consumption + - data://garden/energy/2023-07-10/primary_energy_consumption # Loaded to calculate emissions per GDP. - data://garden/ggdc/2020-10-01/ggdc_maddison # Loaded to create per-capita variables. - - data://garden/demography/2022-12-08/population + - data://garden/demography/2023-03-31/population # Loaded to create region aggregates (including income groups). 
- data://garden/regions/2023-01-01/regions - - data://garden/wb/2021-07-01/wb_income - data://grapher/gcp/2023-04-28/global_carbon_budget: - - data://garden/gcp/2023-04-28/global_carbon_budget + - data://garden/wb/2023-04-30/income_groups + data://grapher/gcp/2023-07-10/global_carbon_budget: + - data://garden/gcp/2023-07-10/global_carbon_budget + # + # Emissions - CO2 dataset (2023). + # + data://garden/emissions/2023-07-10/owid_co2: + - data://garden/emissions/2023-05-02/national_contributions + - data://garden/gcp/2023-07-10/global_carbon_budget + - data://garden/cait/2022-08-10/ghg_emissions_by_sector + - data://garden/energy/2023-07-10/primary_energy_consumption + - data://garden/demography/2023-03-31/population + - data://garden/ggdc/2020-10-01/ggdc_maddison + - data://garden/regions/2023-01-01/regions # # RFF - World Carbon Pricing (2022-10-11). # @@ -97,3 +97,37 @@ steps: - data://garden/demography/2022-12-08/population data://grapher/emissions/2023-05-02/national_contributions: - data://garden/emissions/2023-05-02/national_contributions + ###################################################################################################################### + # Older versions to be archived once they are not used by any other steps. + # + # GCP - Global Carbon Budget (2023-04-28). + # + data://meadow/gcp/2023-04-28/global_carbon_budget: + - snapshot://gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv + - snapshot://gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx + - snapshot://gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx + - snapshot://gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx + data://garden/gcp/2023-04-28/global_carbon_budget: + - data://meadow/gcp/2023-04-28/global_carbon_budget + # Loaded to calculate emissions per unit energy. + - data://garden/energy/2023-06-01/primary_energy_consumption + # Loaded to calculate emissions per GDP. 
+ - data://garden/ggdc/2020-10-01/ggdc_maddison + # Loaded to create per-capita variables. + - data://garden/demography/2022-12-08/population + # Loaded to create region aggregates (including income groups). + - data://garden/regions/2023-01-01/regions + - data://garden/wb/2021-07-01/wb_income + data://grapher/gcp/2023-04-28/global_carbon_budget: + - data://garden/gcp/2023-04-28/global_carbon_budget + # + # Emissions - CO2 dataset (2022). + # + data://garden/emissions/2023-05-03/owid_co2: + - data://garden/emissions/2023-05-02/national_contributions + - data://garden/gcp/2023-04-28/global_carbon_budget + - data://garden/cait/2022-08-10/ghg_emissions_by_sector + - data://garden/energy/2023-06-01/primary_energy_consumption + - data://garden/demography/2022-12-08/population + - data://garden/ggdc/2020-10-01/ggdc_maddison + - data://garden/regions/2023-01-01/regions diff --git a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml index d69c0584f73..1cfb103d802 100644 --- a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml +++ b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml @@ -1,5 +1,5 @@ dataset: - title: Global Carbon Budget (Global Carbon Project, 2023) + title: Global Carbon Budget (Global Carbon Project, 2023b) description: | The Global Carbon Budget dataset is available here. @@ -19,15 +19,6 @@ dataset: Our World in Data have renamed bunker fuels as "International transport" for improved clarity, which includes emissions from international aviation and shipping. Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year. - licenses: - - name: Creative Commons Attribution 4.0 International - url: https://zenodo.org/record/7215364 - sources: - - name: Our World in Data based on the Global Carbon Project (2023) - published_by: "Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. 
M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Alkama, R., Arneth, A., Arora, V. K., Bates, N. R., Becker, M., Bellouin, N., Bittig, H. C., Bopp, L., Chevallier, F., Chini, L. P., Cronin, M., Evans, W., Falk, S., Feely, R. A., Gasser, T., Gehlen, M., Gkritzalis, T., Gloege, L., Grassi, G., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jain, A. K., Jersild, A., Kadono, K., Kato, E., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Landschützer, P., Lefèvre, N., Lindsay, K., Liu, J., Liu, Z., Marland, G., Mayot, N., McGrath, M. J., Metzl, N., Monacci, N. M., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K., Ono, T., Palmer, P. I., Pan, N., Pierrot, D., Pocock, K., Poulter, B., Resplandy, L., Robertson, E., Rödenbeck, C., Rodriguez, C., Rosan, T. M., Schwinger, J., Séférian, R., Shutler, J. D., Skjelvan, I., Steinhoff, T., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tanhua, T., Tans, P. P., Tian, X., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., Walker, A. P., Wanninkhof, R., Whitehead, C., Willstrand Wranne, A., Wright, R., Yuan, W., Yue, C., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2022, Earth Syst. Sci. Data, 14, 4811-4900, https://doi.org/10.5194/essd-14-4811-2022, 2022." 
- url: https://www.globalcarbonproject.org/ - date_accessed: 2023-04-28 - tables: global_carbon_budget: variables: diff --git a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.py index f6fb09796e3..319d165e6fe 100644 --- a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.py +++ b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.py @@ -10,7 +10,7 @@ """ import numpy as np -import pandas as pd +import owid.catalog.processing as pr from owid.catalog import Dataset, Table from owid.datautils import dataframes from structlog import get_logger @@ -101,13 +101,6 @@ "primary_energy_consumption__twh": "primary_energy_consumption", } -# Columns to use from GDP data and how to rename them. -GDP_COLUMNS = { - "country": "country", - "year": "year", - "gdp": "gdp", -} - # Columns to use from primary energy consumption data and how to rename them. HISTORICAL_EMISSIONS_COLUMNS = { "country": "country", @@ -156,43 +149,43 @@ def sanity_checks_on_input_data( - df_production: pd.DataFrame, df_consumption: pd.DataFrame, df_historical: pd.DataFrame, df_co2: pd.DataFrame + tb_production: Table, tb_consumption: Table, tb_historical: Table, tb_co2: Table ) -> None: """Run sanity checks on input data files. - These checks should be used prior to country harmonization, but after basic processing of the dataframes. + These checks should be used prior to country harmonization, but after basic processing of the tables. Parameters ---------- - df_production : pd.DataFrame + tb_production : Table Production-based emissions from GCP's official national emissions dataset (excel file). - df_consumption : pd.DataFrame + tb_consumption : Table Consumption-based emissions from GCP's official national emissions dataset (excel file). - df_historical : pd.DataFrame + tb_historical : Table Historical emissions from GCP's official global emissions dataset (excel file). 
- df_co2 : pd.DataFrame + tb_co2 : Table Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). """ - df_production = df_production.copy() - df_consumption = df_consumption.copy() - df_historical = df_historical.copy() - df_co2 = df_co2.copy() + tb_production = tb_production.copy() + tb_consumption = tb_consumption.copy() + tb_historical = tb_historical.copy() + tb_co2 = tb_co2.copy() # In the original data, Bunkers was included in the national data file, as another country. # But I suppose it should be considered as another kind of global emission. # In fact, bunker emissions should coincide for production and consumption emissions. global_bunkers_emissions = ( - df_production[df_production["country"] == "Bunkers"][["year", "production_emissions"]] + tb_production[tb_production["country"] == "Bunkers"][["year", "production_emissions"]] .reset_index(drop=True) .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise") ) - # Check that we get exactly the same array of bunker emissions from the consumption emissions dataframe + # Check that we get exactly the same array of bunker emissions from the consumption emissions table # (on years where there is data for bunker emissions in both datasets). - comparison = pd.merge( + comparison = pr.merge( global_bunkers_emissions, - df_consumption[df_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]] + tb_consumption[tb_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]] .reset_index(drop=True) .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"), how="inner", @@ -200,35 +193,35 @@ def sanity_checks_on_input_data( suffixes=("", "_check"), ) - error = "Bunker emissions were expected to coincide in production and consumption emissions dataframes." + error = "Bunker emissions were expected to coincide in production and consumption emissions tables." 
assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error # Check that all production-based emissions are positive. - error = "There are negative emissions in df_production (from the additional variables dataset)." - assert (df_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + error = "There are negative emissions in tb_production (from the additional variables dataset)." + assert (tb_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error # Check that all production-based emissions from the fossil CO2 dataset are positive. - error = "There are negative emissions in df_co2 (from the fossil CO2 dataset)." - assert (df_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + error = "There are negative emissions in tb_co2 (from the fossil CO2 dataset)." + assert (tb_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error # Check that all consumption-based emissions are positive. - error = "There are negative emissions in df_consumption (from the national emissions dataset)." - assert (df_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error + error = "There are negative emissions in tb_consumption (from the national emissions dataset)." + assert (tb_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error # Check that, for the World, production emissions coincides with consumption emissions (on common years). error = "Production and consumption emissions for the world were expected to be identical." 
- comparison = pd.merge( - df_production[df_production["country"] == "World"].reset_index(drop=True), - df_consumption[df_consumption["country"] == "World"].reset_index(drop=True), + comparison = pr.merge( + tb_production[tb_production["country"] == "World"].reset_index(drop=True), + tb_consumption[tb_consumption["country"] == "World"].reset_index(drop=True), how="inner", on="year", ) assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error # Check that production emissions for the World coincide with global (historical) emissions (on common years). - comparison = pd.merge( - df_production[df_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), - df_historical[["year", "global_fossil_emissions"]], + comparison = pr.merge( + tb_production[tb_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True), + tb_historical[["year", "global_fossil_emissions"]], how="inner", on="year", ) @@ -239,16 +232,16 @@ def sanity_checks_on_input_data( < 0.001 ).all(), error - # Check that emissions in df_production (emissions from the national excel file) coincide with emissions in df_co2 + # Check that emissions in tb_production (emissions from the national excel file) coincide with emissions in tb_co2 # (from the Fossil CO2 emissions csv file). # Given that country names have not yet been harmonized, rename the only countries that are present in both datasets. 
- comparison = pd.merge( - df_co2[["country", "year", "emissions_total"]], - df_production.replace({"Bunkers": "International Transport", "World": "Global"}), + comparison = pr.merge( + tb_co2[["country", "year", "emissions_total"]], + tb_production.replace({"Bunkers": "International Transport", "World": "Global"}), on=["country", "year"], how="inner", ).dropna(subset=["emissions_total", "production_emissions"], how="any") - # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in df_production), + # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in tb_production), # omit that row in the comparison. comparison = comparison.drop( comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index @@ -265,53 +258,53 @@ def sanity_checks_on_input_data( ).all(), error -def sanity_checks_on_output_data(combined_df: pd.DataFrame) -> None: +def sanity_checks_on_output_data(tb_combined: Table) -> None: """Run sanity checks on output data. - These checks should be run on the very final output dataframe (with an index) prior to storing it as a table. + These checks should be run on the very final output table (with an index) prior to storing it as a table. Parameters ---------- - combined_df : pd.DataFrame - Combination of all input dataframes, after processing, harmonization, and addition of variables. + tb_combined : Table + Combination of all input tables, after processing, harmonization, and addition of variables. """ - combined_df = combined_df.reset_index() + tb_combined = tb_combined.reset_index() error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan."
positive_variables = [ col - for col in combined_df.columns + for col in tb_combined.columns if col != "country" if "traded" not in col if "growth" not in col if "land_use" not in col ] - assert (combined_df[positive_variables].fillna(0) >= 0).all().all(), error + assert (tb_combined[positive_variables].fillna(0) >= 0).all().all(), error error = "Production emissions as a share of global emissions should be 100% for 'World' (within 2% error)." - assert combined_df[ - (combined_df["country"] == "World") & (abs(combined_df["emissions_total_as_share_of_global"] - 100) > 2) + assert tb_combined[ + (tb_combined["country"] == "World") & (abs(tb_combined["emissions_total_as_share_of_global"] - 100) > 2) ].empty, error error = "Consumption emissions as a share of global emissions should be 100% for 'World' (within 2% error)." - assert combined_df[ - (combined_df["country"] == "World") & (abs(combined_df["consumption_emissions_as_share_of_global"] - 100) > 2) + assert tb_combined[ + (tb_combined["country"] == "World") & (abs(tb_combined["consumption_emissions_as_share_of_global"] - 100) > 2) ].empty, error error = "Population as a share of global population should be 100% for 'World'." - assert combined_df[ - (combined_df["country"] == "World") & (combined_df["population_as_share_of_global"].fillna(100) != 100) + assert tb_combined[ + (tb_combined["country"] == "World") & (tb_combined["population_as_share_of_global"].fillna(100) != 100) ].empty, error error = "All share of global emissions should be smaller than 100% (within 2% error)." - share_variables = [col for col in combined_df.columns if "share" in col] - assert (combined_df[share_variables].fillna(0) <= 102).all().all(), error + share_variables = [col for col in tb_combined.columns if "share" in col] + assert (tb_combined[share_variables].fillna(0) <= 102).all().all(), error # Check that cumulative variables are monotonically increasing. 
# Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global # (since they are not necessarily monotonic) and land-use change (which can be negative). cumulative_cols = [ - col for col in combined_df.columns if "cumulative" in col if "share" not in col if "land_use" not in col + col for col in tb_combined.columns if "cumulative" in col if "share" not in col if "land_use" not in col ] # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small. # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have @@ -321,7 +314,7 @@ def sanity_checks_on_output_data(combined_df: pd.DataFrame) -> None: "including land-use change emissions, which can be negative)." ) assert ( - combined_df.sort_values("year", ascending=False) + tb_combined.sort_values("year", ascending=False) .groupby("country") .agg({col: lambda x: ((x.pct_change().dropna() * 100) <= 0.1).all() for col in cumulative_cols}) .all() @@ -333,28 +326,28 @@ def sanity_checks_on_output_data(combined_df: pd.DataFrame) -> None: "(or larger than 98%, given small discrepancies)." ) # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%. - share_variables = [col for col in combined_df.columns if "share" in col if "consumption" not in col] - assert (combined_df[combined_df["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error + share_variables = [col for col in tb_combined.columns if "share" in col if "consumption" not in col] + assert (tb_combined[tb_combined["country"] == "World"][share_variables].fillna(100) > 98).all().all(), error error = "Traded emissions for the World should be close to zero (within 2% error)." 
- world_mask = combined_df["country"] == "World" + world_mask = tb_combined["country"] == "World" assert ( abs( 100 - * combined_df[world_mask]["traded_emissions"].fillna(0) - / combined_df[world_mask]["emissions_total"].fillna(1) + * tb_combined[world_mask]["traded_emissions"].fillna(0) + / tb_combined[world_mask]["emissions_total"].fillna(1) ) < 2 ).all(), error -def prepare_fossil_co2_emissions(df_co2: pd.DataFrame) -> pd.DataFrame: +def prepare_fossil_co2_emissions(tb_co2: Table) -> Table: """Prepare Fossil CO2 emissions data (basic processing).""" # Select and rename columns from fossil CO2 data. - df_co2 = df_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise") + tb_co2 = tb_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise") # Ensure all emissions are given in tonnes of CO2. - df_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 + tb_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 #################################################################################################################### # NOTE: For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed @@ -365,13 +358,13 @@ def prepare_fossil_co2_emissions(df_co2: pd.DataFrame) -> pd.DataFrame: # data is missing (without touching other years or other columns). # Firstly, list of years for which the world has no data for emissions_from_other_industry. world_missing_years = ( - df_co2[(df_co2["country"] == "Global") & (df_co2["emissions_from_other_industry"].isnull())]["year"] + tb_co2[(tb_co2["country"] == "Global") & (tb_co2["emissions_from_other_industry"].isnull())]["year"] .unique() .tolist() # type: ignore ) # Data that needs to be aggregated. 
- data_missing_in_world = df_co2[ - df_co2["year"].isin(world_missing_years) & (df_co2["emissions_from_other_industry"].notnull()) + data_missing_in_world = tb_co2[ + tb_co2["year"].isin(world_missing_years) & (tb_co2["emissions_from_other_industry"].notnull()) ] # Check that there is indeed data to be aggregated (that is missing for the World). error = ( @@ -379,66 +372,68 @@ def prepare_fossil_co2_emissions(df_co2: pd.DataFrame) -> pd.DataFrame: "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified." ) assert len(data_missing_in_world) > 0, error - # Create a dataframe of aggregate data for the World, on those years when it's missing. + # Create a table of aggregate data for the World, on those years when it's missing. aggregated_missing_data = ( data_missing_in_world.groupby("year") .agg({"emissions_from_other_industry": "sum"}) .reset_index() .assign(**{"country": "Global"}) ) - # Combine the new dataframe of aggregate data with the main dataframe. - df_co2 = dataframes.combine_two_overlapping_dataframes( - df1=df_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True + # Combine the new table of aggregate data with the main table. + tb_co2 = dataframes.combine_two_overlapping_dataframes( + df1=tb_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True ) + # NOTE: The previous function currently does not properly propagate metadata, but keeps only the sources of the + # first table. But given that both tables combined have the same source, we don't need to manually change it. #################################################################################################################### # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the # emissions of Kuwait. This ensures that they will be included in region aggregates. 
error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991." - assert df_co2[ - (df_co2["country"] == "Kuwaiti Oil Fires") - & (df_co2["emissions_total"].notnull()) - & (df_co2["emissions_total"] != 0) + assert tb_co2[ + (tb_co2["country"] == "Kuwaiti Oil Fires") + & (tb_co2["emissions_total"].notnull()) + & (tb_co2["emissions_total"] != 0) ]["year"].tolist() == [1991], error - df_co2.loc[(df_co2["country"] == "Kuwait") & (df_co2["year"] == 1991), EMISSION_SOURCES] = ( - df_co2[(df_co2["country"] == "Kuwaiti Oil Fires") & (df_co2["year"] == 1991)][EMISSION_SOURCES].values - + df_co2[(df_co2["country"] == "Kuwait") & (df_co2["year"] == 1991)][EMISSION_SOURCES].values + tb_co2.loc[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991), EMISSION_SOURCES] = ( + tb_co2[(tb_co2["country"] == "Kuwaiti Oil Fires") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values + + tb_co2[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values ) # Check that "emissions_total" agrees with the sum of emissions from individual sources. error = "The sum of all emissions should add up to total emissions (within 1%)." assert ( abs( - df_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1) - - df_co2["emissions_total"].fillna(0) + tb_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1) + - tb_co2["emissions_total"].fillna(0) ) - / (df_co2["emissions_total"].fillna(0) + 1e-7) + / (tb_co2["emissions_total"].fillna(0) + 1e-7) < 1e-2 ).all(), error # Many rows have zero total emissions, but actually the individual sources are nan. # Total emissions in those cases should be nan, instead of zero. 
- no_individual_emissions = df_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1) - df_co2.loc[no_individual_emissions, "emissions_total"] = np.nan + no_individual_emissions = tb_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1) + tb_co2.loc[no_individual_emissions, "emissions_total"] = np.nan - return df_co2 + return tb_co2 -def prepare_consumption_emissions(df_consumption: pd.DataFrame) -> pd.DataFrame: +def prepare_consumption_emissions(tb_consumption: Table) -> Table: """Prepare consumption-based emissions data (basic processing).""" # Select and rename columns. - df_consumption = df_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename( + tb_consumption = tb_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename( columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise" ) # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. - for column in df_consumption.drop(columns=["country", "year"]).columns: - df_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + for column in tb_consumption.drop(columns=["country", "year"]).columns: + tb_consumption[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - # List indexes of rows in df_consumption corresponding to outliers (defined above in OUTLIERS_IN_df_consumption). + # List indexes of rows in tb_consumption corresponding to outliers (defined above in OUTLIERS_IN_CONSUMPTION_DF). outlier_indexes = [ - df_consumption[(df_consumption["country"] == outlier[0]) & (df_consumption["year"] == outlier[1])].index.item() + tb_consumption[(tb_consumption["country"] == outlier[0]) & (tb_consumption["year"] == outlier[1])].index.item() for outlier in OUTLIERS_IN_CONSUMPTION_DF ] @@ -446,69 +441,71 @@ def prepare_consumption_emissions(df_consumption: pd.DataFrame) -> pd.DataFrame: "Outliers were expected to have negative consumption emissions. " "Maybe outliers have been fixed (and should be removed from the code)."
) - assert (df_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error + assert (tb_consumption.loc[outlier_indexes]["consumption_emissions"] < 0).all(), error # Remove outliers. - df_consumption = df_consumption.drop(outlier_indexes).reset_index(drop=True) + tb_consumption = tb_consumption.drop(outlier_indexes).reset_index(drop=True) - return df_consumption + return tb_consumption -def prepare_production_emissions(df_production: pd.DataFrame) -> pd.DataFrame: +def prepare_production_emissions(tb_production: Table) -> Table: """Prepare production-based emissions data (basic processing).""" # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. - for column in df_production.drop(columns=["country", "year"]).columns: - df_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + for column in tb_production.drop(columns=["country", "year"]).columns: + tb_production[column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - return df_production + return tb_production -def prepare_land_use_emissions(df_land_use: pd.DataFrame) -> pd.DataFrame: +def prepare_land_use_emissions(tb_land_use: Table) -> Table: """Prepare land-use change emissions data (basic processing).""" # Convert units from megatonnes of carbon per year emissions to tonnes of CO2 per year. - df_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + tb_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - return df_land_use + return tb_land_use -def prepare_historical_emissions(df_historical: pd.DataFrame) -> pd.DataFrame: +def prepare_historical_emissions(tb_historical: Table) -> Table: """Prepare historical emissions data.""" # Select and rename columns from historical emissions data. 
- df_historical = df_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( + tb_historical = tb_historical[list(HISTORICAL_EMISSIONS_COLUMNS)].rename( columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise" ) # Convert units from gigatonnes of carbon per year emissions to tonnes of CO2 per year. - for column in df_historical.drop(columns=["country", "year"]).columns: - df_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 + for column in tb_historical.drop(columns=["country", "year"]).columns: + tb_historical[column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 - return df_historical + return tb_historical -def extract_global_emissions(df_co2: pd.DataFrame, df_historical: pd.DataFrame) -> pd.DataFrame: +def extract_global_emissions(tb_co2: Table, tb_historical: Table, ds_population: Dataset) -> Table: """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset. The resulting global emissions data includes bunker and land-use change emissions. - NOTE: This function has to be used after selecting and renaming columns in df_co2, but before harmonizing country - names in df_co2 (so that "International Transport" is still listed as a country). + NOTE: This function has to be used after selecting and renaming columns in tb_co2, but before harmonizing country + names in tb_co2 (so that "International Transport" is still listed as a country). Parameters ---------- - df_co2 : pd.DataFrame + tb_co2 : Table Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file). - df_historical : pd.DataFrame + tb_historical : Table Historical emissions from GCP's official global emissions dataset (excel file). + ds_population : Dataset + Population dataset. Returns ------- - global_emissions : pd.DataFrame + global_emissions : Table World emissions. """ # For some reason, "International Transport" is included as another country, that only has emissions from oil. 
# We separate it as another variable (only given at the global level). - global_transport = df_co2[df_co2["country"] == INTERNATIONAL_TRANSPORT_LABEL].reset_index(drop=True) + global_transport = tb_co2[tb_co2["country"] == INTERNATIONAL_TRANSPORT_LABEL].reset_index(drop=True) # Check that total emissions for international transport coincide with oil emissions. error = "Total emissions from international transport do not coincide with oil emissions." @@ -521,20 +518,20 @@ def extract_global_emissions(df_co2: pd.DataFrame, df_historical: pd.DataFrame) .rename(columns={"emissions_from_oil": "global_emissions_from_international_transport"}, errors="raise") ) - # Create a new dataframe of global emissions. + # Create a new table of global emissions. global_emissions = ( - df_co2[df_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES] + tb_co2[tb_co2["country"].isin(["Global", "World"])][["year"] + EMISSION_SOURCES] .rename(columns={column: f"global_{column}" for column in EMISSION_SOURCES}, errors="raise") .sort_values("year") .reset_index(drop=True) ) # Add bunker fuels to global emissions. - global_emissions = pd.merge(global_emissions, global_transport, on=["year"], how="outer") + global_emissions = pr.merge(global_emissions, global_transport, on=["year"], how="outer") - # Add historical land-use change emissions to dataframe of global emissions. - global_emissions = pd.merge( - global_emissions, df_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year" + # Add historical land-use change emissions to table of global emissions. + global_emissions = pr.merge( + global_emissions, tb_historical[["year", "global_emissions_from_land_use_change"]], how="left", on="year" ) # Add variable of total emissions including fossil fuels and land use change. @@ -550,23 +547,25 @@ def extract_global_emissions(df_co2: pd.DataFrame, df_historical: pd.DataFrame) global_emissions["country"] = "World" # Add global population. 
- global_emissions = geo.add_population_to_dataframe(df=global_emissions, population_col="global_population") + global_emissions = geo.add_population_to_table( + tb=global_emissions, ds_population=ds_population, population_col="global_population" + ) return global_emissions -def harmonize_country_names(df: pd.DataFrame) -> pd.DataFrame: +def harmonize_country_names(df: Table) -> Table: """Harmonize country names, and fix known issues with certain regions. Parameters ---------- - df : pd.DataFrame + df : Table Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions datasets). Returns ------- - df : pd.DataFrame + df : Table Emissions data after harmonizing country names. """ @@ -597,38 +596,38 @@ def harmonize_country_names(df: pd.DataFrame) -> pd.DataFrame: def combine_data_and_add_variables( - df_co2: pd.DataFrame, - df_production: pd.DataFrame, - df_consumption: pd.DataFrame, - df_global_emissions: pd.DataFrame, - df_land_use: pd.DataFrame, - df_gdp: pd.DataFrame, - df_energy: pd.DataFrame, - df_population: pd.DataFrame, + tb_co2: Table, + tb_production: Table, + tb_consumption: Table, + tb_global_emissions: Table, + tb_land_use: Table, + tb_energy: Table, + ds_gdp: Dataset, + ds_population: Table, ds_regions: Dataset, ds_income_groups: Dataset, ) -> Table: - """Combine all relevant data into one dataframe, add region aggregates, and add custom variables (e.g. emissions per + """Combine all relevant data into one table, add region aggregates, and add custom variables (e.g. emissions per capita). Parameters ---------- - df_co2 : pd.DataFrame + tb_co2 : Table Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization. - df_production : pd.DataFrame + tb_production : Table Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization. 
- df_consumption : pd.DataFrame + tb_consumption : Table Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization. - df_global_emissions : pd.DataFrame + tb_global_emissions : Table World emissions (including bunker and land-use change emissions). - df_land_use : pd.DataFrame + tb_land_use : Table National land-use change emissions from GCP's official dataset (excel file), after harmonization. - df_gdp : pd.DataFrame - GDP data. - df_energy : pd.DataFrame + tb_energy : Table Primary energy data. - df_population : pd.DataFrame - Population data. + ds_gdp : Dataset + GDP dataset. + ds_population : Dataset + Population dataset. ds_regions : Dataset Regions dataset. ds_income_groups : Dataset @@ -636,48 +635,52 @@ def combine_data_and_add_variables( Returns ------- - tb_combined : Table + tb_co2_with_regions : Table Combined data, with all additional variables and with region aggregates. """ + tb_co2_with_regions = tb_co2.copy() + # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions dataset. - gcp_aggregates = sorted(set(df_production["country"]) - set(df_co2["country"])) - df_co2 = pd.concat( + gcp_aggregates = sorted(set(tb_production["country"]) - set(tb_co2_with_regions["country"])) + tb_co2_with_regions = pr.concat( [ - df_co2, - df_production[df_production["country"].isin(gcp_aggregates)] + tb_co2_with_regions, + tb_production[tb_production["country"].isin(gcp_aggregates)] .rename(columns={"production_emissions": "emissions_total"}) .astype({"year": int}), ], ignore_index=True, ).reset_index(drop=True) - # Add consumption emissions to main dataframe (keep only the countries of the main dataframe). - # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to df_co2 - # (when merging with df_production), all countries from df_consumption should be included in df_co2. 
- error = "Some countries in df_consumption are not included in df_co2." - assert set(df_consumption["country"]) < set(df_co2["country"]), error - df_co2 = pd.merge(df_co2, df_consumption, on=["country", "year"], how="outer") + # Add consumption emissions to main table (keep only the countries of the main table). + # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to tb_co2 + # (when merging with tb_production), all countries from tb_consumption should be included in tb_co2. + error = "Some countries in tb_consumption are not included in tb_co2." + assert set(tb_consumption["country"]) < set(tb_co2_with_regions["country"]), error + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_consumption, on=["country", "year"], how="outer") - # Add population to original dataframe. - df_co2 = pd.merge(df_co2, df_population[["country", "year", "population"]], on=["country", "year"], how="left") + # Add population to original table. + tb_co2_with_regions = geo.add_population_to_table( + tb=tb_co2_with_regions, ds_population=ds_population, warn_on_missing_countries=False + ) - # Add GDP to main dataframe. - df_co2 = pd.merge(df_co2, df_gdp, on=["country", "year"], how="left") + # Add GDP to main table. + tb_co2_with_regions = geo.add_gdp_to_table(tb=tb_co2_with_regions, ds_gdp=ds_gdp) - # Add primary energy to main dataframe. - df_co2 = pd.merge(df_co2, df_energy, on=["country", "year"], how="left") + # Add primary energy to main table. + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_energy, on=["country", "year"], how="left") # For convenience, rename columns in land-use change emissions data. - df_land_use = df_land_use.rename( + tb_land_use = tb_land_use.rename( columns={"emissions": "emissions_from_land_use_change", "quality_flag": "land_use_change_quality_flag"} ) # Land-use change data does not include data for the World. Include it by merging with the global dataset. 
- df_land_use = pd.concat( + tb_land_use = pr.concat( [ - df_land_use, - df_global_emissions.rename( + tb_land_use, + tb_global_emissions.rename( columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"} )[["year", "emissions_from_land_use_change"]] .dropna() @@ -686,19 +689,24 @@ def combine_data_and_add_variables( ignore_index=True, ).astype({"year": int}) - # Add land-use change emissions to main dataframe. - df_co2 = pd.merge(df_co2, df_land_use, on=["country", "year"], how="outer") + # Add land-use change emissions to main table. + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_land_use, on=["country", "year"], how="outer") # Add total emissions (including land-use change) for each country. - df_co2["emissions_total_including_land_use_change"] = ( - df_co2["emissions_total"] + df_co2["emissions_from_land_use_change"] + tb_co2_with_regions["emissions_total_including_land_use_change"] = ( + tb_co2_with_regions["emissions_total"] + tb_co2_with_regions["emissions_from_land_use_change"] ) + # Create a copy of the current table, to be able to copy its metadata after adding region aggregates. + _tb_co2_with_regions = tb_co2_with_regions.copy() + # Add region aggregates. # Aggregate not only emissions data, but also population, gdp and primary energy. # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data. 
aggregations = { - column: "sum" for column in df_co2.columns if column not in ["country", "year", "land_use_change_quality_flag"] + column: "sum" + for column in tb_co2_with_regions.columns + if column not in ["country", "year", "land_use_change_quality_flag"] } for region in REGIONS: countries_in_region = geo.list_members_of_region( @@ -709,9 +717,10 @@ def combine_data_and_add_variables( excluded_regions=REGIONS[region].get("excluded_regions", None), additional_members=REGIONS[region].get("additional_members", None), excluded_members=REGIONS[region].get("excluded_members", None), + include_historical_regions_in_income_groups=True, ) - df_co2 = geo.add_region_aggregates( - df=df_co2, + tb_co2_with_regions = geo.add_region_aggregates( + df=tb_co2_with_regions, region=region, countries_in_region=countries_in_region, countries_that_must_have_data=[], @@ -719,17 +728,24 @@ def combine_data_and_add_variables( aggregations=aggregations, ) - # Add global emissions and global cumulative emissions columns to main dataframe. - df_co2 = pd.merge(df_co2, df_global_emissions.drop(columns="country"), on=["year"], how="left") + # NOTE: The previous operation does not preserve metadata. Copy metadata of original table. + tb_co2_with_regions = tb_co2_with_regions.copy_metadata(from_table=_tb_co2_with_regions) - # Ensure main dataframe is sorted (so that cumulative emissions are properly calculated). - df_co2 = df_co2.sort_values(["country", "year"]).reset_index(drop=True) + # Add global emissions and global cumulative emissions columns to main table. + tb_co2_with_regions = pr.merge( + tb_co2_with_regions, tb_global_emissions.drop(columns="country"), on=["year"], how="left" + ) + + # Ensure main table is sorted (so that cumulative emissions are properly calculated). + tb_co2_with_regions = tb_co2_with_regions.sort_values(["country", "year"]).reset_index(drop=True) # Temporarily add certain global emissions variables. 
# This is done simply to be able to consider "consumption_emissions" as just another type of emission # when creating additional variables. - df_co2["global_consumption_emissions"] = df_co2["global_emissions_total"] - df_co2["global_cumulative_consumption_emissions"] = df_co2["global_cumulative_emissions_total"] + tb_co2_with_regions["global_consumption_emissions"] = tb_co2_with_regions["global_emissions_total"] + tb_co2_with_regions["global_cumulative_consumption_emissions"] = tb_co2_with_regions[ + "global_cumulative_emissions_total" + ] # Add new variables for each source of emissions. for column in EMISSION_SOURCES + [ @@ -738,106 +754,141 @@ def combine_data_and_add_variables( "emissions_total_including_land_use_change", ]: # Add per-capita variables. - df_co2[f"{column}_per_capita"] = df_co2[column] / df_co2["population"] + tb_co2_with_regions[f"{column}_per_capita"] = tb_co2_with_regions[column] / tb_co2_with_regions["population"] # Add columns for cumulative emissions. # Rows that had nan emissions will have nan cumulative emissions. # But nans will not be propagated in the sum. # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions # (treating nans as zeros), but will have nan on those rows that were not informed. - df_co2[f"cumulative_{column}"] = df_co2.groupby(["country"])[column].cumsum() + # NOTE: Currently, this operation doesn't propagate metadata properly. This has to be done manually. + tb_co2_with_regions[f"cumulative_{column}"] = ( + tb_co2_with_regions.groupby(["country"])[column].cumsum().copy_metadata(tb_co2_with_regions[column]) + ) # Add share of global emissions. - df_co2[f"{column}_as_share_of_global"] = 100 * df_co2[column] / df_co2[f"global_{column}"] + tb_co2_with_regions[f"{column}_as_share_of_global"] = ( + 100 * tb_co2_with_regions[column] / tb_co2_with_regions[f"global_{column}"] + ) # Add share of global cumulative emissions. 
- df_co2[f"cumulative_{column}_as_share_of_global"] = ( - 100 * df_co2[f"cumulative_{column}"] / df_co2[f"global_cumulative_{column}"] + tb_co2_with_regions[f"cumulative_{column}_as_share_of_global"] = ( + 100 * tb_co2_with_regions[f"cumulative_{column}"] / tb_co2_with_regions[f"global_cumulative_{column}"] ) # Add total emissions per unit energy (in kg of emissions per kWh). - df_co2["emissions_total_per_unit_energy"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total"] / (df_co2["primary_energy_consumption"] * TWH_TO_KWH) + tb_co2_with_regions["emissions_total_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total"] + / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) ) # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). - df_co2["emissions_total_including_land_use_change_per_unit_energy"] = ( + tb_co2_with_regions["emissions_total_including_land_use_change_per_unit_energy"] = ( TONNES_OF_CO2_TO_KG_OF_CO2 - * df_co2["emissions_total_including_land_use_change"] - / (df_co2["primary_energy_consumption"] * TWH_TO_KWH) + * tb_co2_with_regions["emissions_total_including_land_use_change"] + / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) ) # Add total emissions per unit GDP. - df_co2["emissions_total_per_gdp"] = TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total"] / df_co2["gdp"] + tb_co2_with_regions["emissions_total_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["emissions_total"] / tb_co2_with_regions["gdp"] + ) # Add total emissions (including land-use change) per unit GDP. 
- df_co2["emissions_total_including_land_use_change_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["emissions_total_including_land_use_change"] / df_co2["gdp"] + tb_co2_with_regions["emissions_total_including_land_use_change_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total_including_land_use_change"] + / tb_co2_with_regions["gdp"] ) # Add total consumption emissions per unit GDP. - df_co2["consumption_emissions_per_gdp"] = ( - TONNES_OF_CO2_TO_KG_OF_CO2 * df_co2["consumption_emissions"] / df_co2["gdp"] + tb_co2_with_regions["consumption_emissions_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["consumption_emissions"] / tb_co2_with_regions["gdp"] ) # Add variable of emissions embedded in trade. - df_co2["traded_emissions"] = df_co2["consumption_emissions"] - df_co2["emissions_total"] - df_co2["pct_traded_emissions"] = 100 * df_co2["traded_emissions"] / df_co2["emissions_total"] - df_co2["traded_emissions_per_capita"] = df_co2["traded_emissions"] / df_co2["population"] + tb_co2_with_regions["traded_emissions"] = ( + tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total"] + ) + tb_co2_with_regions["pct_traded_emissions"] = ( + 100 * tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["emissions_total"] + ) + tb_co2_with_regions["traded_emissions_per_capita"] = ( + tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["population"] + ) # Add variable of emissions embedded in trade, including land-use change emissions. 
- df_co2["traded_emissions_including_land_use_change"] = ( - df_co2["consumption_emissions"] - df_co2["emissions_total_including_land_use_change"] + tb_co2_with_regions["traded_emissions_including_land_use_change"] = ( + tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total_including_land_use_change"] ) - df_co2["pct_traded_emissions_including_land_use_change"] = ( - 100 * df_co2["traded_emissions_including_land_use_change"] / df_co2["emissions_total_including_land_use_change"] + tb_co2_with_regions["pct_traded_emissions_including_land_use_change"] = ( + 100 + * tb_co2_with_regions["traded_emissions_including_land_use_change"] + / tb_co2_with_regions["emissions_total_including_land_use_change"] ) - df_co2["traded_emissions_including_land_use_change_per_capita"] = ( - df_co2["traded_emissions_including_land_use_change"] / df_co2["population"] + tb_co2_with_regions["traded_emissions_including_land_use_change_per_capita"] = ( + tb_co2_with_regions["traded_emissions_including_land_use_change"] / tb_co2_with_regions["population"] ) # Remove temporary columns. - df_co2 = df_co2.drop(columns=["global_consumption_emissions", "global_cumulative_consumption_emissions"]) + tb_co2_with_regions = tb_co2_with_regions.drop( + columns=["global_consumption_emissions", "global_cumulative_consumption_emissions"] + ) # Add annual percentage growth of total emissions. - df_co2["pct_growth_emissions_total"] = df_co2.groupby("country")["emissions_total"].pct_change() * 100 + # NOTE: Currently, this operation doesn't propagate metadata properly. This has to be done manually. + tb_co2_with_regions["pct_growth_emissions_total"] = ( + tb_co2_with_regions.groupby("country")["emissions_total"].pct_change() * 100 + ).copy_metadata(tb_co2_with_regions["emissions_total"]) # Add annual percentage growth of total emissions (including land-use change). 
- df_co2["pct_growth_emissions_total_including_land_use_change"] = ( - df_co2.groupby("country")["emissions_total_including_land_use_change"].pct_change() * 100 - ) + # NOTE: Currently, this operation doesn't propagate metadata properly. This has to be done manually. + tb_co2_with_regions["pct_growth_emissions_total_including_land_use_change"] = ( + tb_co2_with_regions.groupby("country")["emissions_total_including_land_use_change"].pct_change() * 100 + ).copy_metadata(tb_co2_with_regions["emissions_total_including_land_use_change"]) # Add annual absolute growth of total emissions. - df_co2["growth_emissions_total"] = df_co2.groupby("country")["emissions_total"].diff() + tb_co2_with_regions["growth_emissions_total"] = tb_co2_with_regions.groupby("country")["emissions_total"].diff() # Add annual absolute growth of total emissions (including land-use change). - df_co2["growth_emissions_total_including_land_use_change"] = df_co2.groupby("country")[ + tb_co2_with_regions["growth_emissions_total_including_land_use_change"] = tb_co2_with_regions.groupby("country")[ "emissions_total_including_land_use_change" ].diff() # Create variable of population as a share of global population. - df_co2["population_as_share_of_global"] = df_co2["population"] / df_co2["global_population"] * 100 + tb_co2_with_regions["population_as_share_of_global"] = ( + tb_co2_with_regions["population"] / tb_co2_with_regions["global_population"] * 100 + ) # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. - for column in df_co2.drop(columns=["country", "year"]).columns: - df_co2.loc[np.isinf(df_co2[column]), column] = np.nan + for column in tb_co2_with_regions.drop(columns=["country", "year"]).columns: + tb_co2_with_regions.loc[np.isinf(tb_co2_with_regions[column]), column] = np.nan # For special GCP countries/regions (e.g. "Africa (GCP)") we should keep only the original data. 
# Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. - added_variables = df_co2.drop(columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA).columns.tolist() - df_co2.loc[(df_co2["country"].str.contains(" (GCP)", regex=False)), added_variables] = np.nan + added_variables = tb_co2_with_regions.drop( + columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA + ).columns.tolist() + tb_co2_with_regions.loc[ + (tb_co2_with_regions["country"].str.contains(" (GCP)", regex=False)), added_variables + ] = np.nan # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). - df_co2 = df_co2.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index(drop=True) + tb_co2_with_regions = tb_co2_with_regions.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index( + drop=True + ) # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently. - df_co2 = df_co2.set_index(["country", "year"], verify_integrity=True) - df_co2 = df_co2.dropna(subset=df_co2.columns, how="all").sort_index().sort_index(axis=1) + tb_co2_with_regions = tb_co2_with_regions.set_index(["country", "year"], verify_integrity=True) + tb_co2_with_regions = ( + tb_co2_with_regions.dropna(subset=tb_co2_with_regions.columns, how="all").sort_index().sort_index(axis=1) + ) - # Create a table with the generated data. - tb_combined = Table(df_co2, short_name=paths.short_name, underscore=True) + # Rename table. + tb_co2_with_regions.metadata.short_name = paths.short_name - return tb_combined + return tb_co2_with_regions def run(dest_dir: str) -> None: @@ -846,89 +897,76 @@ def run(dest_dir: str) -> None: # # Load meadow dataset and read all its tables. 
ds_meadow: Dataset = paths.load_dependency("global_carbon_budget") - tb_co2 = ds_meadow["global_carbon_budget_fossil_co2_emissions"] - tb_historical = ds_meadow["global_carbon_budget_historical_budget"] - tb_consumption = ds_meadow["global_carbon_budget_consumption_emissions"] - tb_production = ds_meadow["global_carbon_budget_production_emissions"] - tb_land_use = ds_meadow["global_carbon_budget_land_use_change"] + tb_co2 = ds_meadow["global_carbon_budget_fossil_co2_emissions"].reset_index() + tb_historical = ds_meadow["global_carbon_budget_historical_budget"].reset_index() + tb_consumption = ds_meadow["global_carbon_budget_consumption_emissions"].reset_index() + tb_production = ds_meadow["global_carbon_budget_production_emissions"].reset_index() + tb_land_use = ds_meadow["global_carbon_budget_land_use_change"].reset_index() # Load primary energy consumption dataset and read its main table. ds_energy: Dataset = paths.load_dependency("primary_energy_consumption") - tb_energy = ds_energy["primary_energy_consumption"] + tb_energy = ds_energy["primary_energy_consumption"].reset_index() - # Load GDP dataset and read its main table. + # Load GDP dataset. ds_gdp: Dataset = paths.load_dependency("ggdc_maddison") - tb_gdp = ds_gdp["maddison_gdp"] - # Load population dataset and read its main table. + # Load population dataset. ds_population: Dataset = paths.load_dependency("population") - tb_population = ds_population["population"] - # Load regions dataset and read its main tables (it will be used to create region aggregates). + # Load regions dataset. ds_regions: Dataset = paths.load_dependency("regions") - # Load income groups dataset and read its main table (it will be used to create region aggregates). - ds_income_groups: Dataset = paths.load_dependency("wb_income") - - # Create a dataframe for each table. 
- df_co2 = pd.DataFrame(tb_co2).reset_index() - df_historical = pd.DataFrame(tb_historical).reset_index() - df_consumption = pd.DataFrame(tb_consumption).reset_index() - df_production = pd.DataFrame(tb_production).reset_index() - df_land_use = pd.DataFrame(tb_land_use).reset_index() - df_energy = pd.DataFrame(tb_energy).reset_index() - df_gdp = pd.DataFrame(tb_gdp).reset_index() - df_population = pd.DataFrame(tb_population).reset_index() + # Load income groups dataset. + ds_income_groups: Dataset = paths.load_dependency("income_groups") # # Process data. # # Prepare fossil CO2 emissions data. - df_co2 = prepare_fossil_co2_emissions(df_co2=df_co2) + tb_co2 = prepare_fossil_co2_emissions(tb_co2=tb_co2) # Prepare consumption-based emission data. - df_consumption = prepare_consumption_emissions(df_consumption=df_consumption) + tb_consumption = prepare_consumption_emissions(tb_consumption=tb_consumption) # Prepare production-based emission data. - df_production = prepare_production_emissions(df_production=df_production) + tb_production = prepare_production_emissions(tb_production=tb_production) # Prepare land-use emission data. - df_land_use = prepare_land_use_emissions(df_land_use=df_land_use) - - # Select and rename columns from primary energy data. - df_energy = df_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise") + tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) # Select and rename columns from primary energy data. - df_gdp = df_gdp[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") + tb_energy = tb_energy[list(PRIMARY_ENERGY_COLUMNS)].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise") # Prepare historical emissions data. - df_historical = prepare_historical_emissions(df_historical=df_historical) + tb_historical = prepare_historical_emissions(tb_historical=tb_historical) # Run sanity checks on input data. 
sanity_checks_on_input_data( - df_production=df_production, df_consumption=df_consumption, df_historical=df_historical, df_co2=df_co2 + tb_production=tb_production, tb_consumption=tb_consumption, tb_historical=tb_historical, tb_co2=tb_co2 ) # For some reason, "International Transport" is included as another country, that only has emissions from oil. # Extract that data and remove it from the rest of national emissions. - df_global_emissions = extract_global_emissions(df_co2=df_co2, df_historical=df_historical) + tb_global_emissions = extract_global_emissions( + tb_co2=tb_co2, tb_historical=tb_historical, ds_population=ds_population + ) # Harmonize country names. - df_co2 = harmonize_country_names(df=df_co2) - df_consumption = harmonize_country_names(df=df_consumption) - df_production = harmonize_country_names(df=df_production) - df_land_use = harmonize_country_names(df=df_land_use) + tb_co2 = harmonize_country_names(df=tb_co2) + tb_consumption = harmonize_country_names(df=tb_consumption) + tb_production = harmonize_country_names(df=tb_production) + tb_land_use = harmonize_country_names(df=tb_land_use) - # Add new variables to main dataframe (consumption-based emissions, emission intensity, per-capita emissions, etc.). + # Add new variables to main table (consumption-based emissions, emission intensity, per-capita emissions, etc.). tb_combined = combine_data_and_add_variables( - df_co2=df_co2, - df_production=df_production, - df_consumption=df_consumption, - df_global_emissions=df_global_emissions, - df_land_use=df_land_use, - df_gdp=df_gdp, - df_energy=df_energy, - df_population=df_population, + tb_co2=tb_co2, + tb_production=tb_production, + tb_consumption=tb_consumption, + tb_global_emissions=tb_global_emissions, + tb_land_use=tb_land_use, + tb_energy=tb_energy, + ds_gdp=ds_gdp, + ds_population=ds_population, ds_regions=ds_regions, ds_income_groups=ds_income_groups, ) @@ -940,6 +978,7 @@ def run(dest_dir: str) -> None: # Save outputs. 
# # Create a new garden dataset and use metadata from meadow dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata) - + ds_garden = create_dataset( + dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata, check_variables_metadata=True + ) ds_garden.save() diff --git a/etl/steps/data/grapher/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/data/grapher/gcp/2023-07-10/global_carbon_budget.py index 78d07f6e953..3b9a8a9740d 100644 --- a/etl/steps/data/grapher/gcp/2023-07-10/global_carbon_budget.py +++ b/etl/steps/data/grapher/gcp/2023-07-10/global_carbon_budget.py @@ -3,7 +3,6 @@ Some auxiliary variables will be added (where nans are filled with zeros, to avoid missing data in stacked area charts). """ -from copy import deepcopy import numpy as np import pandas as pd @@ -14,37 +13,13 @@ # Get paths and naming conventions for current step. paths = PathFinder(__file__) -# For two stacked area charts (namely "CO₂ emissions by fuel type" and "Cumulative CO₂ emissions by source") having -# nans in the data causes the chart to show only years where all sources have data. -# To avoid this, create additional variables that have nans filled with zeros. -VARIABLES_TO_FILL_WITH_ZEROS = [ - "emissions_total", - "emissions_from_cement", - "emissions_from_coal", - "emissions_from_flaring", - "emissions_from_gas", - "emissions_from_land_use_change", - "emissions_from_oil", - "emissions_from_other_industry", - "cumulative_emissions_total", - "cumulative_emissions_from_cement", - "cumulative_emissions_from_coal", - "cumulative_emissions_from_flaring", - "cumulative_emissions_from_gas", - "cumulative_emissions_from_land_use_change", - "cumulative_emissions_from_oil", - "cumulative_emissions_from_other_industry", -] - def run(dest_dir: str) -> None: # # Load inputs. # - # Load garden dataset. + # Load garden dataset and read its main table. 
ds_garden: Dataset = paths.load_dependency("global_carbon_budget") - - # Read table from garden dataset. tb_garden = ds_garden["global_carbon_budget"] # @@ -57,22 +32,13 @@ def run(dest_dir: str) -> None: years = np.arange(tb_garden.reset_index()["year"].min(), tb_garden.reset_index()["year"].max() + 1, dtype=int) tb_garden = tb_garden.reindex(pd.MultiIndex.from_product([countries, years], names=["country", "year"])) - # Create additional variables in the table that have nans filled with zeros (for two specific stacked area charts). - for variable in VARIABLES_TO_FILL_WITH_ZEROS: - new_variable_name = variable + "_zero_filled" - tb_garden[new_variable_name] = tb_garden[variable].fillna(0) - tb_garden[new_variable_name].metadata = deepcopy(tb_garden[variable].metadata) - tb_garden[new_variable_name].metadata.title = tb_garden[variable].metadata.title + " (zero filled)" - tb_garden[new_variable_name].metadata.description = ( - tb_garden[variable].metadata.description + " Missing data has been filled with zeros for the purposes of " - "data visualization." - ) - # # Save outputs. # # Create a new grapher dataset with the same metadata as the garden dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + ds_grapher = create_dataset( + dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) # Sanity checks. 
grapher_checks(ds_grapher) diff --git a/etl/steps/data/meadow/gcp/2023-07-10/global_carbon_budget.py b/etl/steps/data/meadow/gcp/2023-07-10/global_carbon_budget.py index 06d59474b01..767a5a701d0 100644 --- a/etl/steps/data/meadow/gcp/2023-07-10/global_carbon_budget.py +++ b/etl/steps/data/meadow/gcp/2023-07-10/global_carbon_budget.py @@ -10,7 +10,7 @@ """ -import pandas as pd +import owid.catalog.processing as pr from owid.catalog import Table from structlog import get_logger @@ -24,33 +24,33 @@ paths = PathFinder(__file__) -def prepare_fossil_co2(df_fossil_co2: pd.DataFrame) -> Table: +def prepare_fossil_co2(tb_fossil_co2: Table) -> Table: # Set an appropriate index and sort conveniently. - df_fossil_co2 = df_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) + tb_fossil_co2 = tb_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) - # Create a new table and ensure all columns are snake-case. - tb_fossil_co2 = Table(df_fossil_co2, short_name="global_carbon_budget_fossil_co2_emissions", underscore=True) + # Ensure all columns are snake-case. + tb_fossil_co2 = tb_fossil_co2.underscore() return tb_fossil_co2 -def prepare_historical_budget(df_historical_budget: pd.DataFrame) -> Table: +def prepare_historical_budget(tb_historical: Table) -> Table: """Select variables and prepare the historical budget sheet of GCB's raw global data file. Parameters ---------- - df_historical_budget : pd.DataFrame + tb_historical : Table Historical budget sheet of GCB's raw global data file. Returns ------- - tb_historical_budget : Table + tb_historical : Table Historical budget after selecting variables and processing them. """ # Sanity check. error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." 
- assert df_historical_budget.columns[0] == "Year", error + assert tb_historical.columns[0] == "Year", error # Columns to select in historical budget and how to rename them. columns = { @@ -58,30 +58,26 @@ def prepare_historical_budget(df_historical_budget: pd.DataFrame) -> Table: "fossil emissions excluding carbonation": "global_fossil_emissions", "land-use change emissions": "global_land_use_change_emissions", } - df_historical_budget = df_historical_budget[list(columns)].rename(columns=columns) + tb_historical = tb_historical[list(columns)].rename(columns=columns) # Add column for country (to be able to combine this with the national data). - df_historical_budget["country"] = "World" + tb_historical["country"] = "World" # Set an index and sort row and columns conveniently. - df_historical_budget = ( - df_historical_budget.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - ) + tb_historical = tb_historical.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - # Create a table with the generated data. - tb_historical_budget = Table( - df_historical_budget, short_name="global_carbon_budget_historical_budget", underscore=True - ) + # Rename table. + tb_historical.metadata.short_name = "global_carbon_budget_historical_budget" - return tb_historical_budget + return tb_historical -def prepare_land_use_emissions(df_land_use: pd.DataFrame) -> Table: +def prepare_land_use_emissions(tb_land_use: Table) -> Table: """Prepare data from a specific sheet of the land-use change data file. Parameters ---------- - df_land_use : pd.DataFrame + tb_land_use : Table Data from a specific sheet of the land-use change emissions data file. Returns @@ -90,16 +86,16 @@ def prepare_land_use_emissions(df_land_use: pd.DataFrame) -> Table: Processed land-use change emissions data. """ - df_land_use = df_land_use.copy() + tb_land_use = tb_land_use.copy() # Sanity check. 
error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." - assert df_land_use.columns[1] == "Afghanistan", error + assert tb_land_use.columns[1] == "Afghanistan", error # Extract quality flag from the zeroth row of the data. # Ignore nans (which happen when a certain country has no data). quality_flag = ( - df_land_use.drop(columns=df_land_use.columns[0]) + tb_land_use.drop(columns=tb_land_use.columns[0]) .loc[0] .dropna() .astype(int) @@ -109,36 +105,39 @@ def prepare_land_use_emissions(df_land_use: pd.DataFrame) -> Table: ) # Drop the first row, which is for quality factor (which we have already extracted). - df_land_use = df_land_use.rename(columns={df_land_use.columns[0]: "year"}).drop(0) + tb_land_use = tb_land_use.rename(columns={tb_land_use.columns[0]: "year"}).drop(0) # Ignore countries that have no data. - df_land_use = df_land_use.dropna(axis=1, how="all") + tb_land_use = tb_land_use.dropna(axis=1, how="all") # Restructure data to have a column for country and another for emissions. - df_land_use = df_land_use.melt(id_vars="year", var_name="country", value_name="emissions") + tb_land_use = tb_land_use.melt(id_vars="year", var_name="country", value_name="emissions") error = "Countries with emissions data differ from countries with quality flag." - assert set(df_land_use["country"]) == set(quality_flag["country"]), error + assert set(tb_land_use["country"]) == set(quality_flag["country"]), error # Add quality factor as an additional column. - df_land_use = pd.merge(df_land_use, quality_flag, how="left", on="country") + tb_land_use = pr.merge(tb_land_use, quality_flag, how="left", on="country") + + # Copy metadata from another existing variable to the new quality flag. + tb_land_use["quality_flag"] = tb_land_use["quality_flag"].copy_metadata(tb_land_use["emissions"]) # Set an index and sort row and columns conveniently. 
- df_land_use = df_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + tb_land_use = tb_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - # Create a table with the generated data. - tb_land_use = Table(df_land_use, short_name="global_carbon_budget_land_use_change", underscore=True) + # Rename table. + tb_land_use.metadata.short_name = "global_carbon_budget_land_use_change" return tb_land_use -def prepare_national_emissions(df: pd.DataFrame, column_name: str) -> Table: +def prepare_national_emissions(tb: Table, column_name: str) -> Table: """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national data file. Parameters ---------- - df : pd.DataFrame + tb : Table Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. column_name : str Name to assign to emissions column to be generated. @@ -149,13 +148,13 @@ def prepare_national_emissions(df: pd.DataFrame, column_name: str) -> Table: Processed territorial (or consumption) emissions sheet of GCB's raw national data file. """ - df = df.copy() + tb = tb.copy() error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." - assert df.columns[1] == "Afghanistan", error + assert tb.columns[1] == "Afghanistan", error # The zeroth column is expected to be year. - df = df.rename(columns={df.columns[0]: "year"}) + tb = tb.rename(columns={tb.columns[0]: "year"}) # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). @@ -163,23 +162,21 @@ def prepare_national_emissions(df: pd.DataFrame, column_name: str) -> Table: # handled at the garden step. # Remove unnecessary column. 
- df = df.drop(columns=["Statistical Difference"]) + tb = tb.drop(columns=["Statistical Difference"]) # Convert from wide to long format dataframe. - df = df.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) + tb = tb.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) # Set an index and sort row and columns conveniently. - df = df.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - # Create a table with the generated data. - tb_national = Table(df, short_name=f"global_carbon_budget_{column_name}", underscore=True) + # Rename table. + tb.metadata.short_name = f"global_carbon_budget_{column_name}" - return tb_national + return tb def run(dest_dir: str) -> None: - log.info("global_carbon_budget.start") - # # Load inputs. # @@ -190,37 +187,45 @@ def run(dest_dir: str) -> None: snap_land_use: Snapshot = paths.load_dependency("global_carbon_budget_land_use_change_emissions.xlsx") # Load data from fossil CO2 emissions. - df_fossil_co2 = pd.read_csv(snap_fossil_co2.path) + tb_fossil_co2 = pr.read_csv(snap_fossil_co2.path, metadata=snap_fossil_co2.to_table_metadata()) # Load historical budget from the global emissions file. - df_historical = pd.read_excel(snap_global.path, sheet_name="Historical Budget", skiprows=15) + tb_historical = pr.read_excel( + snap_global.path, sheet_name="Historical Budget", skiprows=15, metadata=snap_global.to_table_metadata() + ) # Load land-use emissions. - df_land_use = pd.read_excel(snap_land_use.path, sheet_name="BLUE", skiprows=7) + tb_land_use = pr.read_excel( + snap_land_use.path, sheet_name="BLUE", skiprows=7, metadata=snap_land_use.to_table_metadata() + ) # Load production-based national emissions. 
- df_production = pd.read_excel(snap_national.path, sheet_name="Territorial Emissions", skiprows=11) + tb_production = pr.read_excel( + snap_national.path, sheet_name="Territorial Emissions", skiprows=11, metadata=snap_national.to_table_metadata() + ) # Load consumption-based national emissions. - df_consumption = pd.read_excel(snap_national.path, sheet_name="Consumption Emissions", skiprows=8) + tb_consumption = pr.read_excel( + snap_national.path, sheet_name="Consumption Emissions", skiprows=8, metadata=snap_national.to_table_metadata() + ) # # Process data. # # Prepare data for fossil CO2 emissions. - tb_fossil_co2 = prepare_fossil_co2(df_fossil_co2=df_fossil_co2) + tb_fossil_co2 = prepare_fossil_co2(tb_fossil_co2=tb_fossil_co2) # Prepare data for historical emissions. - tb_historical = prepare_historical_budget(df_historical_budget=df_historical) + tb_historical = prepare_historical_budget(tb_historical=tb_historical) # Prepare data for land-use emissions. - tb_land_use = prepare_land_use_emissions(df_land_use=df_land_use) + tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) # Prepare data for production-based emissions, from the file of national emissions. - tb_production = prepare_national_emissions(df=df_production, column_name="production_emissions") + tb_production = prepare_national_emissions(tb=tb_production, column_name="production_emissions") # Prepare data for consumption-based emissions, from the file of national emissions. - tb_consumption = prepare_national_emissions(df=df_consumption, column_name="consumption_emissions") + tb_consumption = prepare_national_emissions(tb=tb_consumption, column_name="consumption_emissions") # # Save outputs. @@ -230,9 +235,6 @@ def run(dest_dir: str) -> None: dest_dir, tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], default_metadata=snap_fossil_co2.metadata, + check_variables_metadata=True, ) - - # Save changes in the new garden dataset. 
ds_meadow.save() - - log.info("global_carbon_budget.end") diff --git a/snapshots/gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv.dvc b/snapshots/gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv.dvc index 609951fd55e..7e58cf8da36 100644 --- a/snapshots/gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv.dvc +++ b/snapshots/gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv.dvc @@ -4,7 +4,7 @@ meta: name: Global Carbon Budget - Fossil CO2 emissions version: '2023-04-28' publication_date: '2022-11-11' - source_name: Global Carbon Project + source_name: Global Carbon Budget (2022) source_published_by: "Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew,\ \ R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters,\ \ G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J.\ @@ -33,7 +33,6 @@ meta: date_accessed: 2023-04-28 is_public: true description: | - Fossil CO2 emissions dataset (long csv file), containing global and national data on fossil fuel CO2 emissions from 1750 until today. wdir: ../../../data/snapshots/gcp/2023-04-28 outs: - md5: 8a9511bb7d2623ac32d6cf49707c09d0 diff --git a/snapshots/gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx.dvc b/snapshots/gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx.dvc index a1c771a80f9..0d04502eadd 100644 --- a/snapshots/gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx.dvc +++ b/snapshots/gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx.dvc @@ -4,7 +4,7 @@ meta: name: Global Carbon Budget - Global emissions version: '2023-04-28' publication_date: '2022-11-11' - source_name: Global Carbon Project + source_name: Global Carbon Budget (2022) source_published_by: "Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew,\ \ R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters,\ \ G. 
P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J.\ diff --git a/snapshots/gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx.dvc b/snapshots/gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx.dvc index e6549df09cd..c9b052afe5a 100644 --- a/snapshots/gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx.dvc +++ b/snapshots/gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx.dvc @@ -4,7 +4,7 @@ meta: name: Global Carbon Budget - Land-use change emissions version: '2023-04-28' publication_date: '2022-11-11' - source_name: Global Carbon Project + source_name: Global Carbon Budget (2022) source_published_by: "Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew,\ \ R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters,\ \ G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J.\ diff --git a/snapshots/gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx.dvc b/snapshots/gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx.dvc index db415addd7b..5f7101b25a5 100644 --- a/snapshots/gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx.dvc +++ b/snapshots/gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx.dvc @@ -4,7 +4,7 @@ meta: name: Global Carbon Budget - National emissions version: '2023-04-28' publication_date: '2022-11-11' - source_name: Global Carbon Project + source_name: Global Carbon Budget (2022) source_published_by: "Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew,\ \ R. M., Gregor, L., Hauck, J., Le Quéré, C., Luijkx, I. T., Olsen, A., Peters,\ \ G. 
P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J.\ From a7bc01af90e1d553e4a6e7764c1f13d73f6d4930 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 12:32:11 +0200 Subject: [PATCH 04/16] feat(emissions): Update owid_co2 dataset --- etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml b/etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml index 080d012dcff..ee1d1cfd50c 100644 --- a/etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml +++ b/etl/steps/data/garden/emissions/2023-07-10/owid_co2.meta.yml @@ -1,5 +1,5 @@ dataset: - title: OWID CO2 dataset (2023) + title: OWID CO2 dataset (2023b) description: | OWID CO2 dataset. @@ -7,6 +7,3 @@ dataset: # Dataset sources will be created in the step by combining all component datasets' sources. # Also, table metadata will be built from the tables' original metadata. - -tables: - {} From 02b873d781d264c0232ccd5fbfd7f3bea1475b2c Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 12:55:42 +0200 Subject: [PATCH 05/16] fix(helpers): Remove version_tracker test that ensures that the latest version of a step is in the active dag --- etl/helpers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/helpers.py b/etl/helpers.py index 5706037e0f1..3038ae1dba1 100644 --- a/etl/helpers.py +++ b/etl/helpers.py @@ -827,6 +827,8 @@ def check_that_archive_steps_are_not_dependencies_of_active_steps(self) -> None: def check_that_latest_version_of_steps_are_active(self) -> None: # Check that the latest version of each main data step is in the dag. # If not, it could be because it has been deleted by accident. + # We may decide to remove this test, because it will raise an error if an old step is archived, and it has no + # newer version. 
This can happen for example if the name of the step was changed during the update. latest_data_steps = set( self.step_attributes_df[ (self.step_attributes_df["n_newer_versions"] == 0) @@ -873,7 +875,7 @@ def get_backported_db_dataset_ids(self) -> List[int]: def apply_sanity_checks(self) -> None: self.check_that_archive_steps_are_not_dependencies_of_active_steps() - self.check_that_latest_version_of_steps_are_active() + # self.check_that_latest_version_of_steps_are_active() self.check_that_all_active_steps_are_necessary() From 80dd7883eec978b843bd8be8abc6cbfe30d1319d Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 12:56:19 +0200 Subject: [PATCH 06/16] Create new version of photovoltaic cost and capacity dataset, and archive unused steps --- dag/archive/emissions.yml | 32 ++++ dag/archive/energy.yml | 152 ++++++++++++++++++ dag/emissions.yml | 34 +--- dag/energy.yml | 150 +---------------- .../photovoltaic_cost_and_capacity.meta.yml | 87 ++++++++++ .../photovoltaic_cost_and_capacity.py | 144 +++++++++++++++++ .../photovoltaic_cost_and_capacity.py | 23 +++ 7 files changed, 444 insertions(+), 178 deletions(-) create mode 100644 etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.meta.yml create mode 100644 etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py create mode 100644 etl/steps/data/grapher/energy/2023-07-10/photovoltaic_cost_and_capacity.py diff --git a/dag/archive/emissions.yml b/dag/archive/emissions.yml index 4459eba6d3e..06d2ed7b07d 100644 --- a/dag/archive/emissions.yml +++ b/dag/archive/emissions.yml @@ -56,3 +56,35 @@ steps: - data://garden/energy/2022-07-29/primary_energy_consumption data://grapher/gcp/2022-11-11/global_carbon_budget: - data://garden/gcp/2022-11-11/global_carbon_budget + # + # GCP - Global Carbon Budget (2023-04-28). 
+ # + data://meadow/gcp/2023-04-28/global_carbon_budget: + - snapshot://gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv + - snapshot://gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx + - snapshot://gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx + - snapshot://gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx + data://garden/gcp/2023-04-28/global_carbon_budget: + - data://meadow/gcp/2023-04-28/global_carbon_budget + # Loaded to calculate emissions per unit energy. + - data://garden/energy/2023-06-01/primary_energy_consumption + # Loaded to calculate emissions per GDP. + - data://garden/ggdc/2020-10-01/ggdc_maddison + # Loaded to create per-capita variables. + - data://garden/demography/2022-12-08/population + # Loaded to create region aggregates (including income groups). + - data://garden/regions/2023-01-01/regions + - data://garden/wb/2021-07-01/wb_income + data://grapher/gcp/2023-04-28/global_carbon_budget: + - data://garden/gcp/2023-04-28/global_carbon_budget + # + # Emissions - CO2 dataset (2022). + # + data://garden/emissions/2023-05-03/owid_co2: + - data://garden/emissions/2023-05-02/national_contributions + - data://garden/gcp/2023-04-28/global_carbon_budget + - data://garden/cait/2022-08-10/ghg_emissions_by_sector + - data://garden/energy/2023-06-01/primary_energy_consumption + - data://garden/demography/2022-12-08/population + - data://garden/ggdc/2020-10-01/ggdc_maddison + - data://garden/regions/2023-01-01/regions diff --git a/dag/archive/energy.yml b/dag/archive/energy.yml index 596685b1443..7cad741be07 100644 --- a/dag/archive/energy.yml +++ b/dag/archive/energy.yml @@ -265,3 +265,155 @@ steps: - data://garden/demography/2023-03-31/population - data://garden/ggdc/2020-10-01/ggdc_maddison - data://garden/regions/2023-01-01/regions + # + # IRENA - Renewable electricity capacity (and generation, although the generation part is ignored for now, 2022). 
+ # + data://meadow/irena/2023-01-04/renewable_electricity_capacity_and_generation: + - walden://irena/2022-10-07/renewable_electricity_capacity_and_generation + data://garden/irena/2023-01-04/renewable_electricity_capacity: + - data://meadow/irena/2023-01-04/renewable_electricity_capacity_and_generation + data://grapher/irena/2023-01-04/renewable_electricity_capacity_by_technology: + - data://garden/irena/2023-01-04/renewable_electricity_capacity + # + # Energy - Photovoltaic cost and capacity. + # + data://garden/energy/2023-01-04/photovoltaic_cost_and_capacity: + - data://garden/papers/2023-01-04/nemet_2009 + - data://garden/papers/2023-01-04/farmer_lafond_2016 + - data://garden/irena/2023-01-04/renewable_electricity_capacity + - data://garden/irena/2023-01-04/renewable_power_generation_costs + data://grapher/energy/2023-01-04/photovoltaic_cost_and_capacity: + - data://garden/energy/2023-01-04/photovoltaic_cost_and_capacity + # + # BP - Statistical review 2021. + # + # NOTE: This dataset is not used in grapher. It exists only to fill gaps in the 2022 version. + data://garden/bp/2022-07-11/statistical_review: + - backport://backport/owid/latest/dataset_5347_statistical_review_of_world_energy__bp__2021 + - data://garden/owid/latest/key_indicators + - data://garden/wb/2021-07-01/wb_income + - data://garden/regions/2023-01-01/regions + # + # BP - Statistical review 2022. + # + # NOTE: For the moment this is not the full processing (which is still done in importers). + # This garden step loads the dataset and adds region aggregates properly, plus some other minor improvements. + # Here, we also remove some regions that had misleading data (BP regions like "Other *"). + data://garden/bp/2022-12-28/statistical_review: + # The backported 2022 release is the main source of data of this step. 
+ - backport://backport/owid/latest/dataset_5650_statistical_review_of_world_energy__bp__2022 + # The 2021 release is loaded just to fill missing data in the current version (and to get a missing column). + - data://garden/bp/2022-07-11/statistical_review + - data://garden/owid/latest/key_indicators + - data://garden/wb/2021-07-01/wb_income + - data://garden/regions/2023-01-01/regions + data://grapher/bp/2022-12-28/statistical_review: + - data://garden/bp/2022-12-28/statistical_review + # + # BP - Fossil fuel reserves/production ratio (2022). + # + data://garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio: + - data://garden/bp/2022-12-28/statistical_review + data://grapher/bp/2022-12-28/fossil_fuel_reserves_production_ratio: + - data://garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio + # + # Energy - Primary energy consumption 2023. + # + data://garden/energy/2023-06-01/primary_energy_consumption: + - data://garden/bp/2022-12-28/statistical_review + - data://garden/eia/2022-07-27/energy_consumption + - data://garden/ggdc/2020-10-01/ggdc_maddison + - data://garden/demography/2023-03-31/population + data://grapher/energy/2023-06-01/primary_energy_consumption: + - data://garden/energy/2023-06-01/primary_energy_consumption + # + # EIA - Energy consumption 2022. + # + data://meadow/eia/2022-07-27/energy_consumption: + - walden://eia/2022-07-27/international_energy_data + data://garden/eia/2022-07-27/energy_consumption: + - data://meadow/eia/2022-07-27/energy_consumption + - data://garden/owid/latest/key_indicators + - data://garden/regions/2023-01-01/regions + # + # Shift - Fossil fuel production 2022. + # + data://meadow/shift/2022-07-18/fossil_fuel_production: + - walden://shift/2022-07-18/fossil_fuel_production + data://garden/shift/2022-07-18/fossil_fuel_production: + - data://meadow/shift/2022-07-18/fossil_fuel_production + - data://garden/regions/2023-01-01/regions + # + # Energy - Fossil Fuel Production 2023. 
+ # + data://garden/energy/2023-06-01/fossil_fuel_production: + - data://garden/bp/2022-12-28/statistical_review + - data://garden/shift/2022-07-18/fossil_fuel_production + - data://garden/demography/2023-03-31/population + data://grapher/energy/2023-06-01/fossil_fuel_production: + - data://garden/energy/2023-06-01/fossil_fuel_production + # + # Smil - Global primary energy (2017). + # + data://meadow/smil/2017-01-01/global_primary_energy: + - walden://smil/2017-01-01/global_primary_energy + data://garden/smil/2017-01-01/global_primary_energy: + - data://meadow/smil/2017-01-01/global_primary_energy + # + # Energy - Global primary energy (2023). + # + data://garden/energy/2023-06-01/global_primary_energy: + - data://garden/smil/2017-01-01/global_primary_energy + - data://garden/bp/2022-12-28/statistical_review + data://grapher/energy/2023-06-01/global_primary_energy: + - data://garden/energy/2023-06-01/global_primary_energy + # + # Ember - Yearly electricity data 2023. + # + data://meadow/ember/2023-06-01/yearly_electricity: + - snapshot://ember/2023-06-01/yearly_electricity.csv + data://garden/ember/2023-06-01/yearly_electricity: + - data://meadow/ember/2023-06-01/yearly_electricity + - data://garden/demography/2023-03-31/population + - data://garden/wb/2023-04-30/income_groups + - data://garden/regions/2023-01-01/regions + # + # Ember - Combined electricity 2023. + # + # We still use EER 2022 for data from 1990-1999, which is not included in the 2023 version. + data://garden/ember/2023-06-01/combined_electricity: + - data://garden/ember/2022-08-01/european_electricity_review + - data://garden/ember/2023-06-01/yearly_electricity + # + # Energy - Electricity mix (BP & Ember, 2023). 
+ # + data://garden/energy/2023-06-01/electricity_mix: + - data://garden/bp/2022-12-28/statistical_review + - data://garden/ember/2023-06-01/combined_electricity + - data://garden/demography/2023-03-31/population + data://grapher/energy/2023-06-01/electricity_mix: + - data://garden/energy/2023-06-01/electricity_mix + # + # UK BEIS - UK historical electricity (2022). + # + data://meadow/uk_beis/2022-07-28/uk_historical_electricity: + - walden://uk_beis/2022-07-28/uk_historical_electricity + data://garden/uk_beis/2022-07-28/uk_historical_electricity: + - data://meadow/uk_beis/2022-07-28/uk_historical_electricity + # + # Energy - UK historical electricity (2023). + # + data://garden/energy/2023-06-01/uk_historical_electricity: + - data://garden/uk_beis/2022-07-28/uk_historical_electricity + - data://garden/energy/2023-06-01/electricity_mix + data://grapher/energy/2023-06-01/uk_historical_electricity: + - data://garden/energy/2023-06-01/uk_historical_electricity + # + # BP - Energy mix 2023. + # + data://garden/bp/2023-02-20/energy_mix: + - data://garden/bp/2022-12-28/statistical_review + - data://garden/demography/2022-12-08/population + - data://garden/wb/2021-07-01/wb_income + data://grapher/bp/2023-02-20/energy_mix: + - data://garden/bp/2023-02-20/energy_mix diff --git a/dag/emissions.yml b/dag/emissions.yml index e59e2f17a06..19ce979b298 100644 --- a/dag/emissions.yml +++ b/dag/emissions.yml @@ -99,35 +99,5 @@ steps: - data://garden/emissions/2023-05-02/national_contributions ###################################################################################################################### # Older versions to be archived once they are not used by any other steps. - # - # GCP - Global Carbon Budget (2023-04-28). 
- # - data://meadow/gcp/2023-04-28/global_carbon_budget: - - snapshot://gcp/2023-04-28/global_carbon_budget_fossil_co2_emissions.csv - - snapshot://gcp/2023-04-28/global_carbon_budget_global_emissions.xlsx - - snapshot://gcp/2023-04-28/global_carbon_budget_national_emissions.xlsx - - snapshot://gcp/2023-04-28/global_carbon_budget_land_use_change_emissions.xlsx - data://garden/gcp/2023-04-28/global_carbon_budget: - - data://meadow/gcp/2023-04-28/global_carbon_budget - # Loaded to calculate emissions per unit energy. - - data://garden/energy/2023-06-01/primary_energy_consumption - # Loaded to calculate emissions per GDP. - - data://garden/ggdc/2020-10-01/ggdc_maddison - # Loaded to create per-capita variables. - - data://garden/demography/2022-12-08/population - # Loaded to create region aggregates (including income groups). - - data://garden/regions/2023-01-01/regions - - data://garden/wb/2021-07-01/wb_income - data://grapher/gcp/2023-04-28/global_carbon_budget: - - data://garden/gcp/2023-04-28/global_carbon_budget - # - # Emissions - CO2 dataset (2022). - # - data://garden/emissions/2023-05-03/owid_co2: - - data://garden/emissions/2023-05-02/national_contributions - - data://garden/gcp/2023-04-28/global_carbon_budget - - data://garden/cait/2022-08-10/ghg_emissions_by_sector - - data://garden/energy/2023-06-01/primary_energy_consumption - - data://garden/demography/2022-12-08/population - - data://garden/ggdc/2020-10-01/ggdc_maddison - - data://garden/regions/2023-01-01/regions + + ###################################################################################################################### diff --git a/dag/energy.yml b/dag/energy.yml index 0da9a318877..e428c68ae36 100644 --- a/dag/energy.yml +++ b/dag/energy.yml @@ -65,13 +65,13 @@ steps: # # Energy - Photovoltaic cost and capacity. 
# - data://garden/energy/2023-01-04/photovoltaic_cost_and_capacity: + data://garden/energy/2023-07-10/photovoltaic_cost_and_capacity: - data://garden/papers/2023-01-04/nemet_2009 - data://garden/papers/2023-01-04/farmer_lafond_2016 - - data://garden/irena/2023-01-04/renewable_electricity_capacity + - data://garden/irena/2023-06-26/renewable_electricity_capacity - data://garden/irena/2023-01-04/renewable_power_generation_costs - data://grapher/energy/2023-01-04/photovoltaic_cost_and_capacity: - - data://garden/energy/2023-01-04/photovoltaic_cost_and_capacity + data://grapher/energy/2023-07-10/photovoltaic_cost_and_capacity: + - data://garden/energy/2023-07-10/photovoltaic_cost_and_capacity # # Energy Institute - Statistical Review of World Energy (2023). # @@ -207,147 +207,5 @@ steps: ###################################################################################################################### # Older versions to be archived once they are not used by any other steps. - # - # IRENA - Renewable electricity capacity (and generation, although the generation part is ignored for now, 2022). - # - data://meadow/irena/2023-01-04/renewable_electricity_capacity_and_generation: - - walden://irena/2022-10-07/renewable_electricity_capacity_and_generation - data://garden/irena/2023-01-04/renewable_electricity_capacity: - - data://meadow/irena/2023-01-04/renewable_electricity_capacity_and_generation - data://grapher/irena/2023-01-04/renewable_electricity_capacity_by_technology: - - data://garden/irena/2023-01-04/renewable_electricity_capacity - # - # BP - Statistical review 2021. - # - # NOTE: This dataset is not used in grapher. It exists only to fill gaps in the 2022 version. 
- data://garden/bp/2022-07-11/statistical_review: - - backport://backport/owid/latest/dataset_5347_statistical_review_of_world_energy__bp__2021 - - data://garden/owid/latest/key_indicators - - data://garden/wb/2021-07-01/wb_income - - data://garden/regions/2023-01-01/regions - # - # BP - Statistical review 2022. - # - # NOTE: For the moment this is not the full processing (which is still done in importers). - # This garden step loads the dataset and adds region aggregates properly, plus some other minor improvements. - # Here, we also remove some regions that had misleading data (BP regions like "Other *"). - data://garden/bp/2022-12-28/statistical_review: - # The backported 2022 release is the main source of data of this step. - - backport://backport/owid/latest/dataset_5650_statistical_review_of_world_energy__bp__2022 - # The 2021 release is loaded just to fill missing data in the current version (and to get a missing column). - - data://garden/bp/2022-07-11/statistical_review - - data://garden/owid/latest/key_indicators - - data://garden/wb/2021-07-01/wb_income - - data://garden/regions/2023-01-01/regions - data://grapher/bp/2022-12-28/statistical_review: - - data://garden/bp/2022-12-28/statistical_review - # - # BP - Fossil fuel reserves/production ratio (2022). - # - data://garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio: - - data://garden/bp/2022-12-28/statistical_review - data://grapher/bp/2022-12-28/fossil_fuel_reserves_production_ratio: - - data://garden/bp/2022-12-28/fossil_fuel_reserves_production_ratio - # - # Energy - Primary energy consumption 2023. 
- # - data://garden/energy/2023-06-01/primary_energy_consumption: - - data://garden/bp/2022-12-28/statistical_review - - data://garden/eia/2022-07-27/energy_consumption - - data://garden/ggdc/2020-10-01/ggdc_maddison - - data://garden/demography/2023-03-31/population - data://grapher/energy/2023-06-01/primary_energy_consumption: - - data://garden/energy/2023-06-01/primary_energy_consumption - # - # EIA - Energy consumption 2022. - # - data://meadow/eia/2022-07-27/energy_consumption: - - walden://eia/2022-07-27/international_energy_data - data://garden/eia/2022-07-27/energy_consumption: - - data://meadow/eia/2022-07-27/energy_consumption - - data://garden/owid/latest/key_indicators - - data://garden/regions/2023-01-01/regions - # - # Shift - Fossil fuel production 2022. - # - data://meadow/shift/2022-07-18/fossil_fuel_production: - - walden://shift/2022-07-18/fossil_fuel_production - data://garden/shift/2022-07-18/fossil_fuel_production: - - data://meadow/shift/2022-07-18/fossil_fuel_production - - data://garden/regions/2023-01-01/regions - # - # Energy - Fossil Fuel Production 2023. - # - data://garden/energy/2023-06-01/fossil_fuel_production: - - data://garden/bp/2022-12-28/statistical_review - - data://garden/shift/2022-07-18/fossil_fuel_production - - data://garden/demography/2023-03-31/population - data://grapher/energy/2023-06-01/fossil_fuel_production: - - data://garden/energy/2023-06-01/fossil_fuel_production - # - # Smil - Global primary energy (2017). - # - data://meadow/smil/2017-01-01/global_primary_energy: - - walden://smil/2017-01-01/global_primary_energy - data://garden/smil/2017-01-01/global_primary_energy: - - data://meadow/smil/2017-01-01/global_primary_energy - # - # Energy - Global primary energy (2023). 
- # - data://garden/energy/2023-06-01/global_primary_energy: - - data://garden/smil/2017-01-01/global_primary_energy - - data://garden/bp/2022-12-28/statistical_review - data://grapher/energy/2023-06-01/global_primary_energy: - - data://garden/energy/2023-06-01/global_primary_energy - # - # Ember - Yearly electricity data 2023. - # - data://meadow/ember/2023-06-01/yearly_electricity: - - snapshot://ember/2023-06-01/yearly_electricity.csv - data://garden/ember/2023-06-01/yearly_electricity: - - data://meadow/ember/2023-06-01/yearly_electricity - - data://garden/demography/2023-03-31/population - - data://garden/wb/2023-04-30/income_groups - - data://garden/regions/2023-01-01/regions - # - # Ember - Combined electricity 2023. - # - # We still use EER 2022 for data from 1990-1999, which is not included in the 2023 version. - data://garden/ember/2023-06-01/combined_electricity: - - data://garden/ember/2022-08-01/european_electricity_review - - data://garden/ember/2023-06-01/yearly_electricity - # - # Energy - Electricity mix (BP & Ember, 2023). - # - data://garden/energy/2023-06-01/electricity_mix: - - data://garden/bp/2022-12-28/statistical_review - - data://garden/ember/2023-06-01/combined_electricity - - data://garden/demography/2023-03-31/population - data://grapher/energy/2023-06-01/electricity_mix: - - data://garden/energy/2023-06-01/electricity_mix - # - # UK BEIS - UK historical electricity (2022). - # - data://meadow/uk_beis/2022-07-28/uk_historical_electricity: - - walden://uk_beis/2022-07-28/uk_historical_electricity - data://garden/uk_beis/2022-07-28/uk_historical_electricity: - - data://meadow/uk_beis/2022-07-28/uk_historical_electricity - # - # Energy - UK historical electricity (2023). 
- # - data://garden/energy/2023-06-01/uk_historical_electricity: - - data://garden/uk_beis/2022-07-28/uk_historical_electricity - - data://garden/energy/2023-06-01/electricity_mix - data://grapher/energy/2023-06-01/uk_historical_electricity: - - data://garden/energy/2023-06-01/uk_historical_electricity - # - # BP - Energy mix 2023. - # - data://garden/bp/2023-02-20/energy_mix: - - data://garden/bp/2022-12-28/statistical_review - - data://garden/demography/2022-12-08/population - - data://garden/wb/2021-07-01/wb_income - data://grapher/bp/2023-02-20/energy_mix: - - data://garden/bp/2023-02-20/energy_mix ###################################################################################################################### diff --git a/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.meta.yml b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.meta.yml new file mode 100644 index 00000000000..43e3f753454 --- /dev/null +++ b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.meta.yml @@ -0,0 +1,87 @@ +all_sources: + - nemet_2009: &source-nemet_2009 + name: G. G. Nemet (2009) + published_by: | + Interim monitoring of cost dynamics for publicly supported energy technologies. Energy Policy 37(3): 825-835. by Nemet, G. F. (2009). + url: https://www.sciencedirect.com/science/article/abs/pii/S0301421508005910 + date_accessed: '2023-01-04' + publication_date: '2009-03-01' + publication_year: 2009 + description: | + Photovoltaic cost and capacity data between 1975 and 2003 has been taken from Nemet (2009). + + Prices from Nemet (2009) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year + - farmer_lafond_2016: &source-farmer_lafond_2016 + name: J. D. Farmer & F. Lafond (2016) + published_by: | + How predictable is technological progress? J. D. Farmer & F. Lafond, Research Policy Volume 45, Issue 3, April 2016, Pages 647-665. 
+ url: https://www.sciencedirect.com/science/article/pii/S0048733315001699 + date_accessed: '2023-01-04' + publication_date: '2016-04-01' + publication_year: 2016 + description: | + Photovoltaic cost data between 2004 and 2009 has been taken from Farmer & Lafond (2016). + + According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe Performance Curve DataBase, accessible at pcdb.santafe.edu. The database has been constructed from personal communications and from Colpier and Cornland (2002), Goldemberg et al. (2004), Lieberman (1984), Lipman and Sperling (1999), Zhao (1999), McDonald and Schrattenholzer (2001), Neij et al. (2003), Moore (2006), Nemet (2006), Schilling and Esmundo (2009). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from Koomey and Hultman (2007) and Cooper (2009). The DNA sequencing data is from Wetterstrand (2015) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. + + Prices from Farmer & Lafond (2016) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year + - irena_capacity: &source-irena_capacity + name: International Renewable Energy Agency (IRENA) + published_by: © 2022 by International Renewable Energy Agency (IRENA) + url: https://www.irena.org/Statistics/Download-query-tools + date_accessed: '2022-10-20' + publication_date: '2022-07-01' + publication_year: 2022 + description: | + Photovoltaic capacity data between 2004 and 2021 has been taken from IRENA. 
+ - irena_costs: &source-irena_costs + name: International Renewable Energy Agency (IRENA) + published_by: International Renewable Energy Agency (IRENA) © 2022 by IRENA + url: https://irena.org/publications/2022/Jul/Renewable-Power-Generation-Costs-in-2021 + date_accessed: '2022-10-20' + publication_year: 2022 + description: | + Photovoltaic cost data between 2010 and 2021 has been taken from IRENA. + +dataset: + title: Solar photovoltaic cost and capacity (Energy, 2023b) + description: | + Prices from Nemet (2009) and from Farmer & LaFond (2016) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year + sources: + - *source-nemet_2009 + - *source-farmer_lafond_2016 + - *source-irena_capacity + - *source-irena_costs + +tables: + photovoltaic_cost_and_capacity: + variables: + cost: + title: Solar photovoltaic module price + short_unit: '$/W' + unit: '2021 US$ per Watt' + description: | + Global average price of solar photovoltaic modules. + + IRENA presents solar PV module price series for a number of different module technologies. Here we have adopted the series for thin film a-Si/u-Si or Global Index (from Q4 2013). + sources: + - *source-nemet_2009 + - *source-farmer_lafond_2016 + - *source-irena_costs + cost_source: + title: Data source for cost data + unit: '' + description: Source for each value of cost data. + cumulative_capacity: + title: Solar photovoltaic cumulative capacity + description: | + Global cumulative capacity of solar photovoltaics. + short_unit: 'MW' + unit: 'megawatts' + sources: + - *source-nemet_2009 + - *source-irena_capacity + cumulative_capacity_source: + title: Data source for cumulative capacity data + unit: '' + description: Source for each value of cumulative capacity data. 
diff --git a/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py new file mode 100644 index 00000000000..2e8e91b0c25 --- /dev/null +++ b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py @@ -0,0 +1,144 @@ +"""Combine data from Nemet (2009), Farmer & Lafond (2016) and IRENA on photovoltaic cost and capacity. + +Data content: +* Nemet (2009) provides cumulative capacity data between 1975 and 2003. +* Nemet (2009) provides cost data between 1975 and 2003. +* IRENA provides cumulative capacity data between 2000 and 2021. +* IRENA provides cost data between 2010 and 2021. +* Farmer & Lafond (2016) provide cost data between 1980 and 2013. + +For each informed year, we need to combine these sources with the following two constraints: +* Having data from the most recent source. +* Avoid (as much as possible) having cost and capacity data on a given year from different sources. + +Therefore, for capacity data, we use Nemet (2009) between 1975 and 2003, and IRENA between 2004 and 2021. +For cost data, we use Nemet (2009) between 1975 and 2003, Farmer & Lafond (2016) between 2004 and 2009, and IRENA between 2010 and 2021. + +""" + +import pandas as pd +from owid import catalog +from owid.datautils.dataframes import combine_two_overlapping_dataframes + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current data step. +paths = PathFinder(__file__) + +# Conversion factors. +# Convert 2004 USD to 2021 USD. +USD2004_TO_USD2021 = 1.42 +# Convert 2013 USD to 2021 USD. +USD2013_TO_USD2021 = 1.19 + + +def prepare_capacity_data(tb_nemet: catalog.Table, tb_irena_capacity: catalog.Table) -> catalog.Table: + # Column "previous_capacity" is equivalent to tb_nemet["yearly_capacity"].shift(1).cumsum() + # As they explain in the paper, "Following Epple et al. 
(1991), cumulative capacity is lagged one year to account + # for the time it takes to incorporate new techniques obtained as a result of learning from experience." + tb_nemet_capacity = tb_nemet[["year", "cost", "previous_capacity"]].rename( + columns={"previous_capacity": "cumulative_capacity"}, errors="raise" + )[["year", "cumulative_capacity"]] + # Add column of origin of the data. + tb_nemet_capacity["cumulative_capacity_source"] = "Nemet (2009)" + + # I haven't found a precise definition of the variables in IRENA's dataset, but I expect this to be + # cumulative capacity. + tb_irena_capacity = ( + tb_irena_capacity[tb_irena_capacity["country"] == "World"][["year", "solar_photovoltaic"]] + .rename(columns={"solar_photovoltaic": "cumulative_capacity"}, errors="raise") + .reset_index(drop=True) + ) + tb_irena_capacity["cumulative_capacity_source"] = "IRENA" + + # Combine cumulative capacity from Nemet (2009) and IRENA, prioritising the former on overlapping years. + cumulative_capacity = ( + combine_two_overlapping_dataframes(df1=tb_nemet_capacity, df2=tb_irena_capacity, index_columns=["year"]) + .astype({"year": int}) + .sort_values("year") + .reset_index(drop=True) + ) + + return cumulative_capacity + + +def prepare_cost_data( + tb_nemet: catalog.Table, tb_irena_cost: catalog.Table, tb_farmer_lafond: catalog.Table +) -> catalog.Table: + # Prepare solar photovoltaic cost data from Nemet (2009). + tb_nemet_cost = tb_nemet[["year", "cost"]].copy() + tb_nemet_cost["cost_source"] = "Nemet (2009)" + # Costs are given in "2004 USD/Watt", so we need to convert them to 2021 USD. + tb_nemet_cost["cost"] *= USD2004_TO_USD2021 + + # Prepare solar photovoltaic cost data from Farmer & Lafond (2016).
+ tb_farmer_lafond = ( + tb_farmer_lafond[["year", "photovoltaics"]] + .dropna() + .reset_index(drop=True) + .rename(columns={"photovoltaics": "cost"}, errors="raise") + ) + tb_farmer_lafond["cost_source"] = "Farmer & Lafond (2016)" + # Costs are given in "2013 USD/Wp", so we need to convert them to 2021 USD. + tb_farmer_lafond["cost"] *= USD2013_TO_USD2021 + + # Prepare solar photovoltaic cost data from IRENA. + tb_irena_cost = tb_irena_cost.drop(columns="country") + + tb_irena_cost["cost_source"] = "IRENA" + # Costs are given in "2021 USD/W", so we do not need to correct them. + + # Combine Nemet (2009) and Farmer & Lafond (2016), prioritizing the former. + combined = combine_two_overlapping_dataframes(df1=tb_nemet_cost, df2=tb_farmer_lafond, index_columns="year") + + # Combine the previous with IRENA, prioritizing the latter. + combined = combine_two_overlapping_dataframes(df1=tb_irena_cost, df2=combined, index_columns="year") + + return combined + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load Nemet (2009) dataset from Garden. + ds_nemet: catalog.Dataset = paths.load_dependency("nemet_2009") + tb_nemet = ds_nemet["nemet_2009"].reset_index() + + # Load Farmer & Lafond (2016) dataset from Garden. + ds_farmer_lafond: catalog.Dataset = paths.load_dependency("farmer_lafond_2016") + tb_farmer_lafond = ds_farmer_lafond["farmer_lafond_2016"].reset_index() + + # Load IRENA dataset on capacity from Garden. + ds_irena_capacity: catalog.Dataset = paths.load_dependency("renewable_electricity_capacity") + tb_irena_capacity = ds_irena_capacity["renewable_electricity_capacity"].reset_index() + + # Load IRENA dataset on cost from Garden. + ds_irena_cost: catalog.Dataset = paths.load_dependency("renewable_power_generation_costs") + tb_irena_cost = ds_irena_cost["solar_photovoltaic_module_prices"] + + # + # Process data. + # + # Create a table of cumulative solar photovoltaic capacity, by combining Nemet (2009) and IRENA data. 
+ cumulative_capacity = prepare_capacity_data(tb_nemet=tb_nemet, tb_irena_capacity=tb_irena_capacity) + + # Create a table of solar photovoltaic cost, by combining Nemet (2009), Farmer & Lafond (2016) and IRENA data. + cost = prepare_cost_data(tb_nemet=tb_nemet, tb_irena_cost=tb_irena_cost, tb_farmer_lafond=tb_farmer_lafond) + + # Combine capacity and cost data. + tb_combined = pd.merge(cost, cumulative_capacity, on="year", how="outer") + + # Add column for region. + tb_combined = tb_combined.assign(**{"country": "World"}) + + # Set an appropriate index and sort conveniently. + tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset (no default metadata is passed, since this step combines several datasets). + tb_combined.metadata.short_name = paths.short_name + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined]) + ds_garden.save() diff --git a/etl/steps/data/grapher/energy/2023-07-10/photovoltaic_cost_and_capacity.py b/etl/steps/data/grapher/energy/2023-07-10/photovoltaic_cost_and_capacity.py new file mode 100644 index 00000000000..021c97cfe72 --- /dev/null +++ b/etl/steps/data/grapher/energy/2023-07-10/photovoltaic_cost_and_capacity.py @@ -0,0 +1,23 @@ +"""Load garden dataset of photovoltaic cost and capacity and create a grapher dataset. + +""" + +from owid.catalog import Dataset + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # Load table from garden dataset. + ds_garden: Dataset = paths.load_dependency("photovoltaic_cost_and_capacity") + tb_garden = ds_garden["photovoltaic_cost_and_capacity"] + + # Remove unnecessary columns. + tb_garden = tb_garden.drop(columns=["cost_source", "cumulative_capacity_source"]) + + # Create a new grapher dataset.
+ dataset = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + dataset.save() From 338f03f0f99e246e54e101c22a193383ff7d78ff Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 16:20:38 +0200 Subject: [PATCH 07/16] Adapt steps of Nemet (2009) and Farmer Lafond (2016) datasets --- .../2023-07-10/farmer_lafond_2016.meta.yml | 406 ++++++++++++++++++ .../papers/2023-07-10/farmer_lafond_2016.py | 104 +++++ .../papers/2023-07-10/nemet_2009.meta.yml | 2 + .../garden/papers/2023-07-10/nemet_2009.py | 26 ++ .../papers/2023-07-10/farmer_lafond_2016.py | 55 +++ .../papers/2023-07-10/farmer_lafond_2016.py | 50 +++ .../meadow/papers/2023-07-10/nemet_2009.py | 42 ++ .../2023-01-04/farmer_lafond_2016.csv.dvc | 2 +- .../papers/2023-01-04/nemet_2009.csv.dvc | 3 +- 9 files changed, 688 insertions(+), 2 deletions(-) create mode 100644 etl/steps/data/garden/papers/2023-07-10/farmer_lafond_2016.meta.yml create mode 100644 etl/steps/data/garden/papers/2023-07-10/farmer_lafond_2016.py create mode 100644 etl/steps/data/garden/papers/2023-07-10/nemet_2009.meta.yml create mode 100644 etl/steps/data/garden/papers/2023-07-10/nemet_2009.py create mode 100644 etl/steps/data/grapher/papers/2023-07-10/farmer_lafond_2016.py create mode 100644 etl/steps/data/meadow/papers/2023-07-10/farmer_lafond_2016.py create mode 100644 etl/steps/data/meadow/papers/2023-07-10/nemet_2009.py diff --git a/etl/steps/data/garden/papers/2023-07-10/farmer_lafond_2016.meta.yml b/etl/steps/data/garden/papers/2023-07-10/farmer_lafond_2016.meta.yml new file mode 100644 index 00000000000..862d725bea2 --- /dev/null +++ b/etl/steps/data/garden/papers/2023-07-10/farmer_lafond_2016.meta.yml @@ -0,0 +1,406 @@ +dataset: + title: Cost of different technologies (Farmer & Lafond (2016), 2023b) + description: | + Cost of each technology, expressed in different units, that have been chosen for visualization purposes, namely: + + Acrylic fiber is measured in 1966 USD/lbs. 
+ + Acrylonitrile is measured in 1966 USD/lbs. + + Aluminum is measured in 1966 USD/lbs. + + Ammonia is measured in 1966 USD/lbs. + + Aniline is measured in 1966 USD/lbs. + + Automotive (US) is measured in Gallons/Mile. + + Beer (Japan) is measured in 1955 Yen. + + Benzene is measured in 1958 USD. + + Bisphenol A is measured in 1966 USD/lbs. + + Caprolactam is measured in 1966 USD/lbs. + + Carbon black is measured in 1966 USD/lbs. + + Carbon disulfide is measured in 1966 USD/lbs. + + CCGT power is measured in 1990 USD/kW. + + Concentrating solar is measured in US cents/kWh. + + Corn (US) is measured in acres/1000 bushels. + + Crude oil is measured in 1958 USD. + + Cyclohexane is measured in 1966 USD/lbs. + + DNA sequencing is measured in 2013 USD/human-size genome. + + DRAM is measured in 2005 USD/thousand bits. + + Electric range is measured in 1958 USD. + + Ethanol (Brazil) is measured in 2002 USD/GJ. + + Ethanolamine is measured in 1966 USD/lbs. + + Ethylene is measured in 1966 USD/lbs. + + Formaldehyde is measured in 1966 USD/lbs. + + Free standing gas range is measured in 1958 USD. + + Geothermal electricity is measured in 2005 US cents/kWh. + + Hard disk drive is measured in 2005 USD/megabyte. + + Hydrofluoric acid is measured in 1966 USD/lbs. + + Isopropyl alcohol is measured in 1966 USD/lbs. + + Laser diode is measured in Yen. + + Low-density polyethylene is measured in 1958 USD/pound. + + Magnesium is measured in 1966 USD/lbs. + + Maleic anhydride is measured in 1966 USD/lbs. + + Methanol is measured in 1966 USD/lbs. + + Milk (US) is measured in Heads/Mil.lbs. + + Monochrome television is measured in 1958 USD per unit. + + Motor gasoline is measured in 1958 USD/Gallon. + + Neoprene rubber is measured in 1966 USD/lbs. + + Nuclear electricity is measured in 2004 USD/Watt. + + Onshore gas pipeline is measured in dollar/mile-inch. + + Paraxylene is measured in 1958 USD. + + Pentaerythritol is measured in 1966 USD/lbs. + + Phenol is measured in 1966 USD/lbs. 
+ + Photovoltaics is measured in 2013 USD/Wp. + + Phthalic anhydride is measured in 1966 USD/lbs. + + Polyester fiber is measured in 1966 USD/lbs. + + Polyethylene HD is measured in 1966 USD/lbs. + + Polyethylene LD is measured in 1966 USD/lbs. + + Polypropylene is measured in 1958 USD/pound. + + Polystyrene is measured in 1958 USD/pound. + + Polyvinylchloride is measured in 1958 USD/pound. + + Primary aluminum is measured in 1958 USD/pound. + + Primary magnesium is measured in 1958 USD/pound. + + Refined cane sugar is measured in 1958 USD. + + Sodium is measured in 1966 USD/lbs. + + Sodium chlorate is measured in 1966 USD/lbs. + + Sodium hydrosulfite is measured in 1966 USD/lbs. + + Sorbitol is measured in 1966 USD/lbs. + + Styrene is measured in 1966 USD/lbs. + + Titanium sponge is measured in 1958 USD/lbs. + + Titanium dioxide is measured in 1966 USD/lbs. + + Transistor is measured in 2005 USD. + + Urea is measured in 1966 USD/lbs. + + Vinyl acetate is measured in 1966 USD/lbs. + + Vinyl chloride is measured in 1966 USD/lbs. + + Wind turbine (Denmark) is measured in DKK/kW. + + According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe Performance Curve DataBase, accessible at pcdb.santafe.edu. The database has been constructed from personal communications and from Colpier and Cornland (2002), Goldemberg et al. (2004), Lieberman (1984), Lipman and Sperling (1999), Zhao (1999), McDonald and Schrattenholzer (2001), Neij et al. (2003), Moore (2006), Nemet (2006), Schilling and Esmundo (2009). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from Koomey and Hultman (2007) and Cooper (2009). The DNA sequencing data is from Wetterstrand (2015) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. 
+ +tables: + farmer_lafond_2016: + variables: + acrylic_fiber: + title: Acrylic fiber + description: Cost of acrylic fiber, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + acrylonitrile: + title: Acrylonitrile + description: Cost of acrylonitrile, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + aluminum: + title: Aluminum + description: Cost of aluminum, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + ammonia: + title: Ammonia + description: Cost of ammonia, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + aniline: + title: Aniline + description: Cost of aniline, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + automotive_us: + title: Automotive (US) + description: Cost of automotive (US), measured in Gallons/Mile. + unit: Gallons/Mile + short_unit: '' + beer_japan: + title: Beer (Japan) + description: Cost of beer (Japan), measured in 1955 Yen. + unit: 1955 Yen + short_unit: '' + benzene: + title: Benzene + description: Cost of benzene, measured in 1958 USD. + unit: 1958 USD + short_unit: '' + bisphenol_a: + title: Bisphenol A + description: Cost of bisphenol A, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + caprolactam: + title: Caprolactam + description: Cost of caprolactam, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + carbon_black: + title: Carbon black + description: Cost of carbon black, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + carbon_disulfide: + title: Carbon disulfide + description: Cost of carbon disulfide, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + ccgt_power: + title: CCGT power + description: Cost of CCGT power, measured in 1990 USD/kW. + unit: 1990 USD/kW + short_unit: '' + concentrating_solar: + title: Concentrating solar + description: Cost of concentrating solar, measured in US cents/kWh.
+ unit: US cents/kWh + short_unit: '' + corn_us: + title: Corn (US) + description: Cost of corn (US), measured in acres/1000 bushels. + unit: acres/1000 bushels + short_unit: '' + crude_oil: + title: Crude oil + description: Cost of crude oil, measured in 1958 USD. + unit: 1958 USD + short_unit: '' + cyclohexane: + title: Cyclohexane + description: Cost of cyclohexane, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + dna_sequencing: + title: DNA sequencing + description: Cost of DNA sequencing, measured in 2013 USD/human-size genome. + unit: 2013 USD/human-size genome + short_unit: '' + dram: + title: DRAM + description: Cost of DRAM, measured in 2005 USD/thousand bits. + unit: 2005 USD/thousand bits + short_unit: '' + electric_range: + title: Electric range + description: Cost of electric range, measured in 1958 USD. + unit: 1958 USD + short_unit: '' + ethanol_brazil: + title: Ethanol (Brazil) + description: Cost of ethanol (Brazil), measured in 2002 USD/GJ. + unit: 2002 USD/GJ + short_unit: '' + ethanolamine: + title: Ethanolamine + description: Cost of ethanolamine, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + ethylene: + title: Ethylene + description: Cost of ethylene, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + formaldehyde: + title: Formaldehyde + description: Cost of formaldehyde, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + free_standing_gas_range: + title: Free standing gas range + description: Cost of free standing gas range, measured in 1958 USD. + unit: 1958 USD + short_unit: '' + geothermal_electricity: + title: Geothermal electricity + description: Cost of geothermal electricity, measured in 2005 US cents/kWh. + unit: 2005 US cents/kWh + short_unit: '' + hard_disk_drive: + title: Hard disk drive + description: Cost of hard disk drive, measured in 2005 USD/megabyte.
+ unit: 2005 USD/megabyte + short_unit: '' + hydrofluoric_acid: + title: Hydrofluoric acid + description: Cost of hydrofluoric acid, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + isopropyl_alcohol: + title: Isopropyl alcohol + description: Cost of isopropyl alcohol, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + laser_diode: + title: Laser diode + description: Cost of laser diode, measured in Yen. + unit: Yen + short_unit: '' + low_density_polyethylene: + title: Low-density polyethylene + description: Cost of low-density polyethylene, measured in 1958 USD/pound. + unit: 1958 USD/pound + short_unit: '' + magnesium: + title: Magnesium + description: Cost of magnesium, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + maleic_anhydride: + title: Maleic anhydride + description: Cost of maleic anhydride, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + methanol: + title: Methanol + description: Cost of methanol, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + milk_us: + title: Milk (US) + description: Cost of milk (US), measured in Heads/Mil.lbs. + unit: Heads/Mil.lbs + short_unit: '' + monochrome_television: + title: Monochrome television + description: Cost of monochrome television, measured in 1958 USD per unit. + unit: 1958 USD per unit + short_unit: '' + motor_gasoline: + title: Motor gasoline + description: Cost of motor gasoline, measured in 1958 USD/Gallon. + unit: 1958 USD/Gallon + short_unit: '' + neoprene_rubber: + title: Neoprene rubber + description: Cost of neoprene rubber, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + nuclear_electricity: + title: Nuclear electricity + description: Cost of nuclear electricity, measured in 2004 USD/Watt. + unit: 2004 USD/Watt + short_unit: '' + onshore_gas_pipeline: + title: Onshore gas pipeline + description: Cost of onshore gas pipeline, measured in dollar/mile-inch. 
+ unit: dollar/mile-inch + short_unit: '' + paraxylene: + title: Paraxylene + description: Cost of paraxylene, measured in 1958 USD. + unit: 1958 USD + short_unit: '' + pentaerythritol: + title: Pentaerythritol + description: Cost of pentaerythritol, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + phenol: + title: Phenol + description: Cost of phenol, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + photovoltaics: + title: Photovoltaics + description: Cost of photovoltaics, measured in 2013 USD/Wp. + unit: 2013 USD/Wp + short_unit: '' + phthalic_anhydride: + title: Phthalic anhydride + description: Cost of phthalic anhydride, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + polyester_fiber: + title: Polyester fiber + description: Cost of polyester fiber, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + polyethylene_hd: + title: Polyethylene HD + description: Cost of polyethylene HD, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + polyethylene_ld: + title: Polyethylene LD + description: Cost of polyethylene LD, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + polypropylene: + title: Polypropylene + description: Cost of polypropylene, measured in 1958 USD/pound. + unit: 1958 USD/pound + short_unit: '' + polystyrene: + title: Polystyrene + description: Cost of polystyrene, measured in 1958 USD/pound. + unit: 1958 USD/pound + short_unit: '' + polyvinylchloride: + title: Polyvinylchloride + description: Cost of polyvinylchloride, measured in 1958 USD/pound. + unit: 1958 USD/pound + short_unit: '' + primary_aluminum: + title: Primary aluminum + description: Cost of primary aluminum, measured in 1958 USD/pound. + unit: 1958 USD/pound + short_unit: '' + primary_magnesium: + title: Primary magnesium + description: Cost of primary magnesium, measured in 1958 USD/pound. 
+ unit: 1958 USD/pound + short_unit: '' + refined_cane_sugar: + title: Refined cane sugar + description: Cost of refined cane sugar, measured in 1958 USD. + unit: 1958 USD + short_unit: '' + sodium: + title: Sodium + description: Cost of sodium, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + sodium_chlorate: + title: Sodium chlorate + description: Cost of sodium chlorate, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + sodium_hydrosulfite: + title: Sodium hydrosulfite + description: Cost of sodium hydrosulfite, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + sorbitol: + title: Sorbitol + description: Cost of sorbitol, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + styrene: + title: Styrene + description: Cost of styrene, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + titanium_sponge: + title: Titanium sponge + description: Cost of titanium sponge, measured in 1958 USD/lbs. + unit: 1958 USD/lbs + short_unit: '' + titanium_dioxide: + title: Titanium dioxide + description: Cost of titanium dioxide, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + transistor: + title: Transistor + description: Cost of transistor, measured in 2005 USD. + unit: 2005 USD + short_unit: '' + urea: + title: Urea + description: Cost of urea, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + vinyl_acetate: + title: Vinyl acetate + description: Cost of vinyl acetate, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + vinyl_chloride: + title: Vinyl chloride + description: Cost of vinyl chloride, measured in 1966 USD/lbs. + unit: 1966 USD/lbs + short_unit: '' + wind_turbine_denmark: + title: Wind turbine (Denmark) + description: Cost of wind turbine (Denmark), measured in DKK/kW. 
+ unit: DKK/kW + short_unit: '' diff --git a/etl/steps/data/garden/papers/2023-07-10/farmer_lafond_2016.py b/etl/steps/data/garden/papers/2023-07-10/farmer_lafond_2016.py new file mode 100644 index 00000000000..d090e85c25e --- /dev/null +++ b/etl/steps/data/garden/papers/2023-07-10/farmer_lafond_2016.py @@ -0,0 +1,104 @@ +"""Harmonize data from Farmer & Lafond (2016) paper on the evolution of the cost of different technologies. + +""" + +from owid.catalog import Dataset + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current data step. +paths = PathFinder(__file__) + +# Columns to select from Meadow table, and how to rename them. +COLUMNS = { + "acrylicfiber": "acrylic_fiber", + "acrylonitrile": "acrylonitrile", + "aluminum": "aluminum", + "ammonia": "ammonia", + "aniline": "aniline", + "automotive__us": "automotive_us", + "beer__japan": "beer_japan", + "benzene": "benzene", + "bisphenola": "bisphenol_a", + "caprolactam": "caprolactam", + "carbonblack": "carbon_black", + "carbondisulfide": "carbon_disulfide", + "ccgt_power": "ccgt_power", + "concentrating_solar": "concentrating_solar", + "corn__us": "corn_us", + "crude_oil": "crude_oil", + "cyclohexane": "cyclohexane", + "dna_sequencing": "dna_sequencing", + "dram": "dram", + "electric_range": "electric_range", + "ethanol__brazil": "ethanol_brazil", + "ethanolamine": "ethanolamine", + "ethylene": "ethylene", + "formaldehyde": "formaldehyde", + "free_standing_gas_range": "free_standing_gas_range", + "geothermal_electricity": "geothermal_electricity", + "hard_disk_drive": "hard_disk_drive", + "hydrofluoricacid": "hydrofluoric_acid", + "isopropylalcohol": "isopropyl_alcohol", + "laser_diode": "laser_diode", + "low_density_polyethylene": "low_density_polyethylene", + "magnesium": "magnesium", + "maleicanhydride": "maleic_anhydride", + "methanol": "methanol", + "milk__us": "milk_us", + "monochrome_television": "monochrome_television", + "motor_gasoline": "motor_gasoline", 
+ "neoprenerubber": "neoprene_rubber", + "nuclear_electricity": "nuclear_electricity", + "onshore_gas_pipeline": "onshore_gas_pipeline", + "paraxylene": "paraxylene", + "pentaerythritol": "pentaerythritol", + "phenol": "phenol", + "photovoltaics": "photovoltaics", + "phthalicanhydride": "phthalic_anhydride", + "polyesterfiber": "polyester_fiber", + "polyethylenehd": "polyethylene_hd", + "polyethyleneld": "polyethylene_ld", + "polypropylene": "polypropylene", + "polystyrene": "polystyrene", + "polyvinylchloride": "polyvinylchloride", + "primary_aluminum": "primary_aluminum", + "primary_magnesium": "primary_magnesium", + "refined_cane_sugar": "refined_cane_sugar", + "sodium": "sodium", + "sodiumchlorate": "sodium_chlorate", + "sodiumhydrosulfite": "sodium_hydrosulfite", + "sorbitol": "sorbitol", + "styrene": "styrene", + "titanium_sponge": "titanium_sponge", + "titanium_dioxide": "titanium_dioxide", + "transistor": "transistor", + "urea": "urea", + "vinylacetate": "vinyl_acetate", + "vinylchloride": "vinyl_chloride", + "wind_turbine__denmark": "wind_turbine_denmark", +} + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load dataset from meadow and read its main table. + ds_meadow: Dataset = paths.load_dependency("farmer_lafond_2016") + tb_meadow = ds_meadow["farmer_lafond_2016"] + + # + # Process data. + # + # Rename technologies conveniently (both in column names and in metadata). + tb_garden = tb_meadow.rename(columns=COLUMNS, errors="raise")[COLUMNS.values()].copy() + + # + # Save outputs. 
+ # + # Create a new dataset with the same metadata as meadow + ds_garden = create_dataset( + dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_meadow.metadata, check_variables_metadata=True + ) + ds_garden.save() diff --git a/etl/steps/data/garden/papers/2023-07-10/nemet_2009.meta.yml b/etl/steps/data/garden/papers/2023-07-10/nemet_2009.meta.yml new file mode 100644 index 00000000000..0b528950364 --- /dev/null +++ b/etl/steps/data/garden/papers/2023-07-10/nemet_2009.meta.yml @@ -0,0 +1,2 @@ +dataset: + title: Cost and capacity of photovoltaic technology (Nemet, 2009) diff --git a/etl/steps/data/garden/papers/2023-07-10/nemet_2009.py b/etl/steps/data/garden/papers/2023-07-10/nemet_2009.py new file mode 100644 index 00000000000..3d53d514a77 --- /dev/null +++ b/etl/steps/data/garden/papers/2023-07-10/nemet_2009.py @@ -0,0 +1,26 @@ +"""Harmonize data from Nemet (2009) paper on cost and capacity of photovoltaic energy. + +""" + +from owid.catalog import Dataset + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current data step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load dataset from meadow and read its main table. + ds_meadow: Dataset = paths.load_dependency("nemet_2009") + tb_meadow = ds_meadow["nemet_2009"] + + # + # Save outputs. + # + # Create a new dataset with the same metadata as meadow + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_meadow], default_metadata=ds_meadow.metadata) + ds_garden.save() diff --git a/etl/steps/data/grapher/papers/2023-07-10/farmer_lafond_2016.py b/etl/steps/data/grapher/papers/2023-07-10/farmer_lafond_2016.py new file mode 100644 index 00000000000..78b5ce5932c --- /dev/null +++ b/etl/steps/data/grapher/papers/2023-07-10/farmer_lafond_2016.py @@ -0,0 +1,55 @@ +"""Load garden dataset for Farmer & Lafond (2016) data and create a grapher dataset. 
+ +""" + +from owid.catalog import Dataset + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current data step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load garden dataset and read its main table. + ds_garden: Dataset = paths.load_dependency("farmer_lafond_2016") + tb_garden = ds_garden["farmer_lafond_2016"] + + # + # Process data. + # + # Replace snake-case names by the original technology names. + tb_garden = tb_garden.rename(columns={column: tb_garden[column].metadata.title for column in tb_garden.columns}) + + # For better visualization, divide the costs of DNA sequencing by 1000, as done in the original paper by Farmer & Lafond (2016). + tb_garden["DNA sequencing"] /= 1000 + + # Remove units from each of the columns (that will be all put together in the same column). + for column in tb_garden.columns: + tb_garden[column].metadata.unit = None + + # Convert table to long format, and rename column so that it can be treated as a country in grapher. + # This way, we can select technologies as we usually do with countries. + tb_garden = ( + tb_garden.reset_index() + .melt(id_vars="year", var_name="country", value_name="cost") + .dropna() + .reset_index(drop=True) + ) + tb_garden["cost"].metadata.title = "Technology cost" + tb_garden["cost"].metadata.unit = "various units" + + # Set an appropriate index and sort conveniently. + tb_garden = tb_garden.set_index(["year", "country"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new grapher dataset. 
+ dataset = create_dataset( + dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + dataset.save() diff --git a/etl/steps/data/meadow/papers/2023-07-10/farmer_lafond_2016.py b/etl/steps/data/meadow/papers/2023-07-10/farmer_lafond_2016.py new file mode 100644 index 00000000000..91a47a7025e --- /dev/null +++ b/etl/steps/data/meadow/papers/2023-07-10/farmer_lafond_2016.py @@ -0,0 +1,50 @@ +"""Load snapshot of Farmer & Lafond (2016) data and create a table. + +""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current data step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load snapshot. + snap = paths.load_dependency("farmer_lafond_2016.csv") + tb = pr.read_csv(snap.path, metadata=snap.to_table_metadata()) + + # + # Prepare data. + # + # Store the unit of each technology cost from the zeroth row. + units = dict(zip(tb.columns.tolist()[1:], tb.loc[0][1:])) + + # The zeroth row will be added as metadata, and the first row is not useful, so drop both. + tb = tb.drop(index=[0, 1]).reset_index(drop=True) + + # Rename year column and make it integer. + tb = tb.rename(columns={"YEAR": "year"}).astype({"year": int}) + + # Add title, units and description to metadata. + for column in tb.drop(columns=["year"]).columns: + tb[column].metadata.title = column + tb[column].metadata.unit = units[column] + tb[column].metadata.description = f"Cost for {column}, measured in {units[column]}." + + # Ensure all columns are snake-case. + tb = tb.underscore() + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset and reuse snapshot metadata. 
+ ds = create_dataset(dest_dir=dest_dir, tables=[tb], default_metadata=snap.metadata, check_variables_metadata=True) + ds.save() diff --git a/etl/steps/data/meadow/papers/2023-07-10/nemet_2009.py b/etl/steps/data/meadow/papers/2023-07-10/nemet_2009.py new file mode 100644 index 00000000000..4aac56472d2 --- /dev/null +++ b/etl/steps/data/meadow/papers/2023-07-10/nemet_2009.py @@ -0,0 +1,42 @@ +"""Load snapshot of Nemet (2009) data and create a table. + +""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current data step. +paths = PathFinder(__file__) + +# Columns to select from snapshot, and how to rename them. +COLUMNS = { + "Cost (2004 USD/Watt)": "cost", + "Time (Year)": "year", + "Yearly Capacity (MW)": "yearly_capacity", + "Previous Capacity (MW)": "previous_capacity", +} + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load snapshot. + snap = paths.load_dependency("nemet_2009.csv") + tb = pr.read_csv(snap.path, metadata=snap.to_table_metadata()) + + # + # Process data. + # + tb = tb.rename(columns=COLUMNS, errors="raise")[COLUMNS.values()] + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset and reuse snapshot metadata. + ds = create_dataset(dest_dir=dest_dir, tables=[tb], default_metadata=snap.metadata) + ds.save() diff --git a/snapshots/papers/2023-01-04/farmer_lafond_2016.csv.dvc b/snapshots/papers/2023-01-04/farmer_lafond_2016.csv.dvc index 2f3428cab04..0928a8f1230 100644 --- a/snapshots/papers/2023-01-04/farmer_lafond_2016.csv.dvc +++ b/snapshots/papers/2023-01-04/farmer_lafond_2016.csv.dvc @@ -3,6 +3,7 @@ meta: short_name: farmer_lafond_2016 name: How predictable is technological progress? source_name: J. D. Farmer & F. Lafond (2016) + source_published_by: How predictable is technological progress? J. D. Farmer & F. 
Lafond, Research Policy Volume 45, Issue 3, April 2016, Pages 647-665 publication_year: 2016 publication_date: 2016-04-01 url: https://www.sciencedirect.com/science/article/pii/S0048733315001699 @@ -12,7 +13,6 @@ meta: date_accessed: 2023-01-04 is_public: true description: | - J. D. Farmer & F. Lafond, Research Policy Volume 45, Issue 3, April 2016, Pages 647-665. wdir: ../../../data/snapshots/papers/2023-01-04 outs: - md5: b2dd2d2d7bdf788da15bd2e7dba8aaa9 diff --git a/snapshots/papers/2023-01-04/nemet_2009.csv.dvc b/snapshots/papers/2023-01-04/nemet_2009.csv.dvc index 394949fd394..0f504422eb0 100644 --- a/snapshots/papers/2023-01-04/nemet_2009.csv.dvc +++ b/snapshots/papers/2023-01-04/nemet_2009.csv.dvc @@ -3,6 +3,8 @@ meta: short_name: nemet_2009 name: Interim monitoring of cost dynamics for publicly supported energy technologies source_name: G. F. Nemet (2009) via Performance Curve Database + source_published_by: | + Interim monitoring of cost dynamics for publicly supported energy technologies, Energy Policy 37(3): 825-835, by Nemet, G. F. (2009), obtained via the Performance Curve Database by http://pcdb.santafe.edu publication_year: 2009 publication_date: 2009-03-01 url: https://pcdb.santafe.edu/graph.php?curve=158 @@ -13,7 +15,6 @@ meta: date_accessed: 2023-01-04 is_public: true description: | - Energy Policy 37(3): 825-835. by Nemet, G. F (2009). Obtained via the Performance Curve Database by http://pcdb.santafe.edu. 
wdir: ../../../data/snapshots/papers/2023-01-04 outs: - md5: 3ad0ba878a3f06addac0d6bc75c92b74 From 97ddd7eb71d38e76db416c31d10debd5ce990035 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 16:22:07 +0200 Subject: [PATCH 08/16] Adapt old IRENA steps --- ...able_power_generation_costs.countries.json | 23 ++ .../renewable_power_generation_costs.meta.yml | 44 +++ .../renewable_power_generation_costs.py | 39 +++ .../renewable_power_generation_costs.py | 33 +++ .../renewable_power_generation_costs.py | 259 ++++++++++++++++++ .../renewable_power_generation_costs.py | 32 +++ .../renewable_power_generation_costs.xlsx.dvc | 21 ++ 7 files changed, 451 insertions(+) create mode 100644 etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.countries.json create mode 100644 etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.meta.yml create mode 100644 etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.py create mode 100644 etl/steps/data/grapher/irena/2023-06-26/renewable_power_generation_costs.py create mode 100644 etl/steps/data/meadow/irena/2023-06-26/renewable_power_generation_costs.py create mode 100644 snapshots/irena/2023-06-26/renewable_power_generation_costs.py create mode 100644 snapshots/irena/2023-06-26/renewable_power_generation_costs.xlsx.dvc diff --git a/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.countries.json b/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.countries.json new file mode 100644 index 00000000000..2b933e7a13a --- /dev/null +++ b/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.countries.json @@ -0,0 +1,23 @@ +{ + "Australia": "Australia", + "Brazil": "Brazil", + "Canada": "Canada", + "China": "China", + "Denmark": "Denmark", + "France": "France", + "Germany": "Germany", + "India": "India", + "Italy": "Italy", + "Japan": "Japan", + "Mexico": "Mexico", + "Netherlands": "Netherlands", + "Republic of 
Korea": "South Korea", + "Spain": "Spain", + "Sweden": "Sweden", + "Ukraine": "Ukraine", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Viet Nam": "Vietnam", + "World": "World", + "T\u00fcrkiye": "Turkey" + } diff --git a/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.meta.yml b/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.meta.yml new file mode 100644 index 00000000000..7b4795ca147 --- /dev/null +++ b/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.meta.yml @@ -0,0 +1,44 @@ +dataset: + title: Renewable power generation costs (IRENA, 2023b) + description: | + Levelized cost of energy (LCOE) estimates the average cost per unit of energy generated across the lifetime of a new power plant. It is measured in 2021 US$ per kilowatt-hour. + +tables: + renewable_power_generation_costs: + variables: + bioenergy: + title: Bioenergy levelized cost of energy + short_unit: $/kWh + unit: 2021 US$ per kilowatt-hour + concentrated_solar_power: + title: Concentrated solar power levelized cost of energy + short_unit: $/kWh + unit: 2021 US$ per kilowatt-hour + geothermal: + title: Geothermal levelized cost of energy + short_unit: $/kWh + unit: 2021 US$ per kilowatt-hour + hydropower: + title: Hydropower levelized cost of energy + short_unit: $/kWh + unit: 2021 US$ per kilowatt-hour + offshore_wind: + title: Offshore wind levelized cost of energy + short_unit: $/kWh + unit: 2021 US$ per kilowatt-hour + onshore_wind: + title: Onshore wind levelized cost of energy + short_unit: $/kWh + unit: 2021 US$ per kilowatt-hour + solar_photovoltaic: + title: Solar photovoltaic levelized cost of energy + short_unit: $/kWh + unit: 2021 US$ per kilowatt-hour + solar_photovoltaic_module_prices: + variables: + cost: + title: Solar photovoltaic module prices + short_unit: "$/W" + unit: "2021 US$ per Watt" + description: | + Global average price of solar photovoltaic modules. 
diff --git a/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.py b/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.py new file mode 100644 index 00000000000..cd66d5e69e1 --- /dev/null +++ b/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.py @@ -0,0 +1,39 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from typing import cast + +from owid.catalog import Dataset, Table + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = cast(Dataset, paths.load_dependency("renewable_power_generation_costs")) + tb = ds_meadow["renewable_power_generation_costs"].reset_index() + tb_solar_pv = ds_meadow["solar_photovoltaic_module_prices"] + + # + # Process data. + # + # Harmonize country names. + tb: Table = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb, tb_solar_pv], default_metadata=ds_meadow.metadata) + ds_garden.save() diff --git a/etl/steps/data/grapher/irena/2023-06-26/renewable_power_generation_costs.py b/etl/steps/data/grapher/irena/2023-06-26/renewable_power_generation_costs.py new file mode 100644 index 00000000000..ce63daebb3e --- /dev/null +++ b/etl/steps/data/grapher/irena/2023-06-26/renewable_power_generation_costs.py @@ -0,0 +1,33 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from typing import cast + +from owid.catalog import Dataset + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = cast(Dataset, paths.load_dependency("renewable_power_generation_costs")) + tb = ds_garden["renewable_power_generation_costs"] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/irena/2023-06-26/renewable_power_generation_costs.py b/etl/steps/data/meadow/irena/2023-06-26/renewable_power_generation_costs.py new file mode 100644 index 00000000000..fa257c0018d --- /dev/null +++ b/etl/steps/data/meadow/irena/2023-06-26/renewable_power_generation_costs.py @@ -0,0 +1,259 @@ +"""Extract global (as well as at the country level for some countries) weighted-average levelized cost of electricity +(LCOE) for all energy sources from IRENA's Renewable Power Generation Costs 2022 dataset. + +Extract solar photovoltaic module prices too. + +NOTE: The original data is poorly formatted. 
Each energy source is given as a separate sheet, with a different +structure. So it's likely that, on the next update, this script will not work. + +""" + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table, TableMeta + +from etl.helpers import PathFinder, create_dataset +from etl.snapshot import Snapshot + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def prepare_solar_pv_module_prices(excel_object: pr.ExcelFile, metadata: TableMeta) -> Table: + """Prepare yearly data on average solar photovoltaic module prices. + + Monthly data will be averaged, and only complete years (with 12 informed months) will be considered. + + Parameters + ---------- + data_file : str + Path to raw data (IRENA's excel file on renewable power generation costs). + + Returns + ------- + pv_prices : Table + PV prices. + + """ + # Photovoltaic technologies to choose for average monthly prices. + pv_technologies = ["Thin film a-Si/u-Si or Global Index (from Q4 2013)"] + + # Load upper table in sheet from Figure 3.2, which is: + # Average monthly solar PV module prices by technology and manufacturing country sold in Europe, 2010 to 2021. + pv_prices = excel_object.parse( + sheet_name="Fig 3.2", + skiprows=4, + skipfooter=18, + usecols=lambda column: "Unnamed" not in column, + metadata=metadata, + ) + + # Rename table. + pv_prices.metadata.short_name = "solar_photovoltaic_module_prices" + + # Transpose dataframe so that each row corresponds to a month. + pv_prices = pv_prices.rename(columns={"2021 USD/W": "technology"}).melt( + id_vars="technology", var_name="month", value_name="cost" + ) + + # Select PV technologies. + pv_prices = pv_prices[pv_prices["technology"].isin(pv_technologies)].reset_index(drop=True) + + # Get year from dates. + pv_prices["year"] = pd.to_datetime(pv_prices["month"], format="%b %y").dt.year + + # For each year get the average cost over all months. 
+ # NOTE: The following operation does not properly propagate metadata. Do it manually. + _pv_prices = pv_prices.copy() + pv_prices = ( + pv_prices.groupby(["technology", "year"]) + .agg({"cost": "mean", "year": "count"}) + .rename(columns={"year": "n_months"}) + .reset_index() + ).copy_metadata(_pv_prices) + + # Remove unnecessary column and add column for region. + pv_prices = pv_prices.drop(columns="technology").assign(**{"country": "World"}) + + # Ignore years for which we don't have 12 months. + pv_prices = pv_prices[pv_prices["n_months"] == 12].drop(columns=["n_months"]).reset_index(drop=True) + + # Set an appropriate index and sort conveniently. + pv_prices = pv_prices.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + return pv_prices + + +def extract_global_cost_for_all_sources_from_excel_file(excel_object: pr.ExcelFile, metadata: TableMeta) -> Table: + """Extract global weighted-average LCOE of all energy sources from the excel file. + + Each energy source is given in a separate sheet, in a different way, to each needs a different treatment. + + Parameters + ---------- + local_file : str + Path to excel file with raw data. + + Returns + ------- + tb : Table + LCOE for different energy sources. + """ + # Extract weighted average LCOE for different sources (each one requires a slightly different processing): + + # Solar photovoltaic. + solar_pv = ( + excel_object.parse("Fig 3.1", skiprows=22, metadata=metadata).dropna(how="all", axis=1).rename(columns={"Unnamed: 1": "temp"}) # type: ignore + ) + solar_pv = solar_pv[solar_pv["temp"] == "Weighted average"].melt( + id_vars="temp", var_name="year", value_name="cost" + )[["year", "cost"]] + solar_pv["technology"] = "Solar photovoltaic" + + # Onshore wind. 
+ onshore_wind = excel_object.parse("Fig 2.12", skiprows=3, usecols=lambda column: "Unnamed" not in column, metadata=metadata).rename( # type: ignore + columns={"Year": "year", "Weighted average": "cost"} + ) + onshore_wind["technology"] = "Onshore wind" + + # Concentrated solar power. + csp = excel_object.parse("Fig 5.7", skiprows=4, metadata=metadata).dropna(how="all", axis=1) # type: ignore + csp = ( + csp[csp["2021 USD/kWh"] == "Weighted average"] + .melt(id_vars="2021 USD/kWh", var_name="year", value_name="cost")[["year", "cost"]] + .reset_index(drop=True) + ) + csp["technology"] = "Concentrated solar power" + + # Offshore wind. + offshore_wind = excel_object.parse("Fig 4.13", skiprows=3, metadata=metadata).rename( # type: ignore + columns={"Year": "year", "Weighted average": "cost"} + )[["year", "cost"]] + offshore_wind["technology"] = "Offshore wind" + + # Geothermal. + geothermal = excel_object.parse("Fig 7.4", skiprows=5, metadata=metadata).rename(columns={"Year": "year", "Weighted average": "cost"})[ # type: ignore + ["year", "cost"] + ] + geothermal["technology"] = "Geothermal" + + # Bioenergy. + bioenergy = ( + excel_object.parse("Fig 8.1", skiprows=20, metadata=metadata).dropna(axis=1, how="all").rename(columns={"Unnamed: 1": "temp"}) # type: ignore + ) + bioenergy = bioenergy[bioenergy["temp"] == "Weighted average"].melt( + id_vars="temp", var_name="year", value_name="cost" + )[["year", "cost"]] + bioenergy["technology"] = "Bioenergy" + + # Hydropower. + hydropower = ( + excel_object.parse("Fig 6.1", skiprows=20, metadata=metadata).dropna(how="all", axis=1).rename(columns={"Unnamed: 1": "temp"}) # type: ignore + ) + hydropower = hydropower[hydropower["temp"] == "Weighted average"].melt( + id_vars="temp", var_name="year", value_name="cost" + )[["year", "cost"]] + hydropower["technology"] = "Hydropower" + + # Concatenate all sources into one dataframe. 
+ tb = pr.concat([solar_pv, onshore_wind, csp, offshore_wind, geothermal, bioenergy, hydropower], ignore_index=True) + + # Add country column. + tb["country"] = "World" + + return tb + + +def extract_country_cost_from_excel_file(excel_object: pr.ExcelFile, metadata: TableMeta) -> Table: + """Extract weighted-average LCOE of certain countries and certain energy sources from the excel file. + + Only onshore wind and solar photovoltaic seem to have this data, and only for specific countries. + + Parameters + ---------- + local_file : str + Path to excel file with raw data. + + Returns + ------- + tb : Table + LCOE for different energy sources. + """ + # Extract LCOE for specific countries and technologies (those that are available in original data). + + # Solar photovoltaic. + solar_pv = ( + excel_object.parse("Fig 3.8", skiprows=5, metadata=metadata).dropna(how="all", axis=1).rename(columns={"2021 USD/kWh": "country"}) # type: ignore + ) + + # Last column is the difference between the cost in the last two years. Remove that column. + solar_pv = solar_pv.drop(columns="2020-2021") + + # Onshore wind. + onshore_wind = ( + excel_object.parse("Fig 2.13", skiprows=6, metadata=metadata).dropna(how="all", axis=1).rename(columns={"Country": "country"}) # type: ignore + ) + + # Country column is repeated. Drop it, and drop column of percentage decrease. + onshore_wind = onshore_wind.drop(columns=["Country.1", "% decrease "]) + + # Add a technology column and concatenate different technologies. + solar_pv["technology"] = "Solar photovoltaic" + onshore_wind["technology"] = "Onshore wind" + combined = pr.concat([solar_pv, onshore_wind], ignore_index=True) + + # Rearrange dataframe to have year as a column. + combined = combined.melt(id_vars=["technology", "country"], var_name="year", value_name="cost") + + return combined + + +def combine_global_and_national_data(tb_costs_global: Table, tb_costs_national: Table) -> Table: + # Combine global and national data. 
+ tb_combined = pr.concat([tb_costs_global, tb_costs_national], ignore_index=True).astype({"year": int}) + + # Convert from long to wide format. + tb_combined = tb_combined.pivot(index=["country", "year"], columns="technology", values="cost").reset_index() + + # Remove name of dummy index. + tb_combined.columns.names = [None] + + # Underscore column names. + tb_combined = tb_combined.underscore() + + # Set an appropriate index and sort conveniently. + tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + return tb_combined + + +def run(dest_dir: str) -> None: + # Retrieve snapshot. + snap: Snapshot = paths.load_dependency("renewable_power_generation_costs.xlsx") + data = pr.ExcelFile(snap.path) + + # Extract global, weighted-average LCOE cost for all energy sources. + tb_costs_global = extract_global_cost_for_all_sources_from_excel_file( + excel_object=data, metadata=snap.to_table_metadata() + ) + + # Extract national LCOE for specific countries and technologies. + tb_costs_national = extract_country_cost_from_excel_file(excel_object=data, metadata=snap.to_table_metadata()) + + # Combine global and national data. + tb_combined = combine_global_and_national_data(tb_costs_global=tb_costs_global, tb_costs_national=tb_costs_national) + + # Extract global data on solar photovoltaic module prices. + tb_solar_pv_prices = prepare_solar_pv_module_prices(excel_object=data, metadata=snap.to_table_metadata()) + + # + # Save outputs. + # + # Create a new Meadow dataset and reuse walden metadata. 
+ ds = create_dataset( + dest_dir=dest_dir, + tables=[tb_combined, tb_solar_pv_prices], + default_metadata=snap.metadata, + check_variables_metadata=True, + ) + ds.save() diff --git a/snapshots/irena/2023-06-26/renewable_power_generation_costs.py b/snapshots/irena/2023-06-26/renewable_power_generation_costs.py new file mode 100644 index 00000000000..949725b8647 --- /dev/null +++ b/snapshots/irena/2023-06-26/renewable_power_generation_costs.py @@ -0,0 +1,32 @@ +"""Script to create a snapshot of dataset 'Renewable Power Generation Costs'.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"irena/{SNAPSHOT_VERSION}/renewable_power_generation_costs.xlsx") + + # Download data from source. + snap.download_from_source() + + # Add file to DVC and upload to S3. 
+ snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/irena/2023-06-26/renewable_power_generation_costs.xlsx.dvc b/snapshots/irena/2023-06-26/renewable_power_generation_costs.xlsx.dvc new file mode 100644 index 00000000000..2cae279d62f --- /dev/null +++ b/snapshots/irena/2023-06-26/renewable_power_generation_costs.xlsx.dvc @@ -0,0 +1,21 @@ +meta: + name: Renewable Power Generation Costs + publication_year: 2022 + publication_date: '2022-07-01' + source_name: International Renewable Energy Agency (IRENA) + source_published_by: International Renewable Energy Agency (IRENA) + url: https://irena.org/publications/2022/Jul/Renewable-Power-Generation-Costs-in-2021 + source_data_url: + https://www.irena.org/-/media/Files/IRENA/Agency/Publication/2022/Jul/IRENA-Datafile-RenPwrGenCosts-in-2021-v1-0.xlsx + license_url: + https://irena.org/-/media/Files/IRENA/Agency/Publication/2022/Jul/IRENA_Renewable_Power_Generation_Costs_2021.pdf + license_name: Copyright IRENA 2021 + date_accessed: 2023-07-19 + is_public: true + description: | + +wdir: ../../../data/snapshots/irena/2023-06-26 +outs: +- md5: 56e7a24b2e61da1fc5bc36262ff7a5bf + size: 675885 + path: renewable_power_generation_costs.xlsx From 1a0a044d915b84026c40110819b69296b075de5b Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 16:23:09 +0200 Subject: [PATCH 09/16] Adapt photovoltaic cost and capacity step and update dag files --- dag/archive/energy.yml | 9 +++ dag/archive/main.yml | 2 +- dag/archive/papers.yml | 17 ++++++ dag/energy.yml | 18 +++--- dag/papers.yml | 20 ++++--- .../photovoltaic_cost_and_capacity.py | 55 +++++++++++++------ 6 files changed, 86 insertions(+), 35 deletions(-) create mode 100644 dag/archive/papers.yml diff --git a/dag/archive/energy.yml b/dag/archive/energy.yml index 7cad741be07..a407e91f02b 100644 --- a/dag/archive/energy.yml +++ b/dag/archive/energy.yml @@ -266,6 +266,15 @@ steps: - data://garden/ggdc/2020-10-01/ggdc_maddison - 
data://garden/regions/2023-01-01/regions # + # IRENA - Renewable power generation costs (2022). + # + data://meadow/irena/2023-01-04/renewable_power_generation_costs: + - walden://irena/2022-10-07/renewable_power_generation_costs + data://garden/irena/2023-01-04/renewable_power_generation_costs: + - data://meadow/irena/2023-01-04/renewable_power_generation_costs + data://grapher/irena/2023-01-04/renewable_power_generation_costs: + - data://garden/irena/2023-01-04/renewable_power_generation_costs + # # IRENA - Renewable electricity capacity (and generation, although the generation part is ignored for now, 2022). # data://meadow/irena/2023-01-04/renewable_electricity_capacity_and_generation: diff --git a/dag/archive/main.yml b/dag/archive/main.yml index df2587d641c..39a2d0e278e 100644 --- a/dag/archive/main.yml +++ b/dag/archive/main.yml @@ -78,7 +78,7 @@ include: # - dag/archive/walkthrough.yml # - dag/archive/examples.yml # - dag/archive/emissions.yml - # - dag/archive/papers.yml + - dag/archive/papers.yml - dag/archive/demography.yml # - dag/archive/war.yml - dag/archive/fasttrack.yml diff --git a/dag/archive/papers.yml b/dag/archive/papers.yml new file mode 100644 index 00000000000..ff12f1766c8 --- /dev/null +++ b/dag/archive/papers.yml @@ -0,0 +1,17 @@ +steps: + # + # Farmer & Lafond (2016). + # + data://meadow/papers/2023-01-04/farmer_lafond_2016: + - snapshot://papers/2023-01-04/farmer_lafond_2016.csv + data://garden/papers/2023-01-04/farmer_lafond_2016: + - data://meadow/papers/2023-01-04/farmer_lafond_2016 + data://grapher/papers/2023-01-04/farmer_lafond_2016: + - data://garden/papers/2023-01-04/farmer_lafond_2016 + # + # Nemet (2009). 
+ # + data://meadow/papers/2023-01-04/nemet_2009: + - snapshot://papers/2023-01-04/nemet_2009.csv + data://garden/papers/2023-01-04/nemet_2009: + - data://meadow/papers/2023-01-04/nemet_2009 diff --git a/dag/energy.yml b/dag/energy.yml index e428c68ae36..77eb0bf7978 100644 --- a/dag/energy.yml +++ b/dag/energy.yml @@ -19,12 +19,12 @@ steps: # # IRENA - Renewable power generation costs (2022). # - data://meadow/irena/2023-01-04/renewable_power_generation_costs: - - walden://irena/2022-10-07/renewable_power_generation_costs - data://garden/irena/2023-01-04/renewable_power_generation_costs: - - data://meadow/irena/2023-01-04/renewable_power_generation_costs - data://grapher/irena/2023-01-04/renewable_power_generation_costs: - - data://garden/irena/2023-01-04/renewable_power_generation_costs + data://meadow/irena/2023-06-26/renewable_power_generation_costs: + - snapshot://irena/2023-06-26/renewable_power_generation_costs.xlsx + data://garden/irena/2023-06-26/renewable_power_generation_costs: + - data://meadow/irena/2023-06-26/renewable_power_generation_costs + data://grapher/irena/2023-06-26/renewable_power_generation_costs: + - data://garden/irena/2023-06-26/renewable_power_generation_costs # # IRENA - Renewable electricity capacity (2023). # @@ -66,10 +66,10 @@ steps: # Energy - Photovoltaic cost and capacity. 
# data://garden/energy/2023-07-10/photovoltaic_cost_and_capacity: - - data://garden/papers/2023-01-04/nemet_2009 - - data://garden/papers/2023-01-04/farmer_lafond_2016 + - data://garden/papers/2023-07-10/nemet_2009 + - data://garden/papers/2023-07-10/farmer_lafond_2016 - data://garden/irena/2023-06-26/renewable_electricity_capacity - - data://garden/irena/2023-01-04/renewable_power_generation_costs + - data://garden/irena/2023-06-26/renewable_power_generation_costs data://grapher/energy/2023-07-10/photovoltaic_cost_and_capacity: - data://garden/energy/2023-07-10/photovoltaic_cost_and_capacity # diff --git a/dag/papers.yml b/dag/papers.yml index e0561c96c1f..ab59439c979 100644 --- a/dag/papers.yml +++ b/dag/papers.yml @@ -2,19 +2,19 @@ steps: # # Farmer & Lafond (2016). # - data://meadow/papers/2023-01-04/farmer_lafond_2016: + data://meadow/papers/2023-07-10/farmer_lafond_2016: - snapshot://papers/2023-01-04/farmer_lafond_2016.csv - data://garden/papers/2023-01-04/farmer_lafond_2016: - - data://meadow/papers/2023-01-04/farmer_lafond_2016 - data://grapher/papers/2023-01-04/farmer_lafond_2016: - - data://garden/papers/2023-01-04/farmer_lafond_2016 + data://garden/papers/2023-07-10/farmer_lafond_2016: + - data://meadow/papers/2023-07-10/farmer_lafond_2016 + data://grapher/papers/2023-07-10/farmer_lafond_2016: + - data://garden/papers/2023-07-10/farmer_lafond_2016 # # Nemet (2009). # - data://meadow/papers/2023-01-04/nemet_2009: + data://meadow/papers/2023-07-10/nemet_2009: - snapshot://papers/2023-01-04/nemet_2009.csv - data://garden/papers/2023-01-04/nemet_2009: - - data://meadow/papers/2023-01-04/nemet_2009 + data://garden/papers/2023-07-10/nemet_2009: + - data://meadow/papers/2023-07-10/nemet_2009 # # Bayliss-Smith & Wanmali (1984). Data on long-term wheat yields. 
# @@ -61,3 +61,7 @@ steps: - snapshot://papers/2023-05-26/mueller_et_al_2012.xls data://garden/papers/2023-05-26/mueller_et_al_2012: - data://meadow/papers/2023-05-26/mueller_et_al_2012 + ###################################################################################################################### + # Older versions to be archived once they are not used by any other steps. + + ###################################################################################################################### diff --git a/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py index 2e8e91b0c25..7aa89c4c711 100644 --- a/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py +++ b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py @@ -16,8 +16,12 @@ """ -import pandas as pd -from owid import catalog +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table +from owid.catalog.tables import ( + get_unique_licenses_from_tables, + get_unique_sources_from_tables, +) from owid.datautils.dataframes import combine_two_overlapping_dataframes from etl.helpers import PathFinder, create_dataset @@ -32,7 +36,7 @@ USD2013_TO_USD2021 = 1.19 -def prepare_capacity_data(tb_nemet: catalog.Table, tb_irena_capacity: catalog.Table) -> catalog.Table: +def prepare_capacity_data(tb_nemet: Table, tb_irena_capacity: Table) -> Table: # Column "previous_capacity" is equivalent to tb_nemet["yearly_capacity"].shift(1).cumsum() # As they explain in the paper, "Following Epple et al. (1991), cumulative capacity is lagged one year to account # for the time it takes to incorporate new techniques obtained as a result of learning from experience." @@ -58,13 +62,19 @@ def prepare_capacity_data(tb_nemet: catalog.Table, tb_irena_capacity: catalog.Ta .sort_values("year") .reset_index(drop=True) ) + # NOTE: The previous operation does not propagate metadata. 
Manually combine sources. + for column in ["cumulative_capacity", "cumulative_capacity_source"]: + cumulative_capacity[column].metadata.sources = get_unique_sources_from_tables( + [tb_nemet_capacity, tb_irena_capacity] + ) + cumulative_capacity[column].metadata.licenses = get_unique_licenses_from_tables( + [tb_nemet_capacity, tb_irena_capacity] + ) return cumulative_capacity -def prepare_cost_data( - tb_nemet: catalog.Table, tb_irena_cost: catalog.Table, tb_farmer_lafond: catalog.Table -) -> catalog.Table: +def prepare_cost_data(tb_nemet: Table, tb_irena_cost: Table, tb_farmer_lafond: Table) -> Table: # Prepare solar photovoltaic cost data from Nemet (2009). tb_nemet_cost = tb_nemet[["year", "cost"]].copy() tb_nemet_cost["cost_source"] = "Nemet (2009)" @@ -94,6 +104,15 @@ def prepare_cost_data( # Combine the previous with IRENA, prioritizing the latter. combined = combine_two_overlapping_dataframes(df1=tb_irena_cost, df2=combined, index_columns="year") + # NOTE: The previous operation does not propagate metadata. Manually combine sources. + for column in ["cost", "cost_source"]: + combined[column].metadata.sources = get_unique_sources_from_tables( + [tb_nemet_cost, tb_farmer_lafond, tb_irena_cost] + ) + combined[column].metadata.licenses = get_unique_licenses_from_tables( + [tb_nemet_cost, tb_farmer_lafond, tb_irena_cost] + ) + return combined @@ -101,21 +120,21 @@ def run(dest_dir: str) -> None: # # Load data. # - # Load Nemet (2009) dataset from Garden. - ds_nemet: catalog.Dataset = paths.load_dependency("nemet_2009") + # Load Nemet (2009) dataset from garden and read its main table. + ds_nemet: Dataset = paths.load_dependency("nemet_2009") tb_nemet = ds_nemet["nemet_2009"].reset_index() - # Load Farmer & Lafond (2016) dataset from Garden. - ds_farmer_lafond: catalog.Dataset = paths.load_dependency("farmer_lafond_2016") + # Load Farmer & Lafond (2016) dataset from garden and read its main table. 
+ ds_farmer_lafond: Dataset = paths.load_dependency("farmer_lafond_2016") tb_farmer_lafond = ds_farmer_lafond["farmer_lafond_2016"].reset_index() - # Load IRENA dataset on capacity from Garden. - ds_irena_capacity: catalog.Dataset = paths.load_dependency("renewable_electricity_capacity") + # Load IRENA dataset on capacity from garden and read its main table. + ds_irena_capacity: Dataset = paths.load_dependency("renewable_electricity_capacity") tb_irena_capacity = ds_irena_capacity["renewable_electricity_capacity"].reset_index() - # Load IRENA dataset on cost from Garden. - ds_irena_cost: catalog.Dataset = paths.load_dependency("renewable_power_generation_costs") - tb_irena_cost = ds_irena_cost["solar_photovoltaic_module_prices"] + # Load IRENA dataset on cost from garden and read its main table. + ds_irena_cost: Dataset = paths.load_dependency("renewable_power_generation_costs") + tb_irena_cost = ds_irena_cost["solar_photovoltaic_module_prices"].reset_index() # # Process data. @@ -127,7 +146,7 @@ def run(dest_dir: str) -> None: cost = prepare_cost_data(tb_nemet=tb_nemet, tb_irena_cost=tb_irena_cost, tb_farmer_lafond=tb_farmer_lafond) # Combine capacity and cost data. - tb_combined = pd.merge(cost, cumulative_capacity, on="year", how="outer") + tb_combined = pr.merge(cost, cumulative_capacity, on="year", how="outer") # Add column for region. tb_combined = tb_combined.assign(**{"country": "World"}) @@ -135,10 +154,12 @@ def run(dest_dir: str) -> None: # Set an appropriate index and sort conveniently. tb_combined = tb_combined.set_index(["country", "year"], verify_integrity=True).sort_index() + # Rename table. + tb_combined.metadata.short_name = paths.short_name + # # Save outputs. 
# # Create a new dataset with the same metadata as meadow - tb_combined.metadata.short_name = paths.short_name ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined]) ds_garden.save() From a9efa3d6a841849fe3bdb347d05d1e4a3fa0a1f2 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 16:25:08 +0200 Subject: [PATCH 10/16] fix(irena): Fix bug --- .../irena/2023-06-26/renewable_power_generation_costs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.py b/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.py index cd66d5e69e1..93892427ae4 100644 --- a/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.py +++ b/etl/steps/data/garden/irena/2023-06-26/renewable_power_generation_costs.py @@ -24,9 +24,7 @@ def run(dest_dir: str) -> None: # Process data. # # Harmonize country names. - tb: Table = geo.harmonize_countries( - df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path - ) + tb: Table = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) # Set an appropriate index and sort conveniently. 
tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() From a5c4fad09b132a9bb78cdeee74b66e306fe8dc92 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 16:36:10 +0200 Subject: [PATCH 11/16] Improve metadata --- .../renewable_electricity_capacity_and_generation.xlsm.dvc | 6 +++--- .../2023-06-26/renewable_power_generation_costs.xlsx.dvc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/irena/2023-06-26/renewable_electricity_capacity_and_generation.xlsm.dvc b/snapshots/irena/2023-06-26/renewable_electricity_capacity_and_generation.xlsm.dvc index ef451eb66e1..2082927b8af 100644 --- a/snapshots/irena/2023-06-26/renewable_electricity_capacity_and_generation.xlsm.dvc +++ b/snapshots/irena/2023-06-26/renewable_electricity_capacity_and_generation.xlsm.dvc @@ -4,11 +4,11 @@ meta: publication_date: '2023-03-01' source_name: International Renewable Energy Agency (IRENA) - source_published_by: International Renewable Energy Agency (IRENA) + source_published_by: International Renewable Energy Agency © IRENA 2022 url: https://www.irena.org/Statistics/Download-query-tools - source_data_url: + source_data_url: https://www.irena.org/IRENADocuments/IRENA_RE_electricity_statistics_-_Query_tool.xlsm - license_url: + license_url: https://www.irena.org/-/media/Files/IRENA/Agency/Publication/2023/Mar/IRENA_RE_Capacity_Statistics_2023.pdf license_name: Copyright © IRENA 2023 date_accessed: 2023-06-28 diff --git a/snapshots/irena/2023-06-26/renewable_power_generation_costs.xlsx.dvc b/snapshots/irena/2023-06-26/renewable_power_generation_costs.xlsx.dvc index 2cae279d62f..1d89c6e7b6e 100644 --- a/snapshots/irena/2023-06-26/renewable_power_generation_costs.xlsx.dvc +++ b/snapshots/irena/2023-06-26/renewable_power_generation_costs.xlsx.dvc @@ -3,7 +3,7 @@ meta: publication_year: 2022 publication_date: '2022-07-01' source_name: International Renewable Energy Agency (IRENA) - source_published_by: International Renewable Energy 
Agency (IRENA) + source_published_by: International Renewable Energy Agency © IRENA 2022 url: https://irena.org/publications/2022/Jul/Renewable-Power-Generation-Costs-in-2021 source_data_url: https://www.irena.org/-/media/Files/IRENA/Agency/Publication/2022/Jul/IRENA-Datafile-RenPwrGenCosts-in-2021-v1-0.xlsx From 232762c493b9d1ecdd2ddb2c759c2ae5c5005066 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 16:37:16 +0200 Subject: [PATCH 12/16] Delete unnecessary walden ingest code --- .../renewable_power_generation_costs.meta.yml | 14 -------- .../renewable_power_generation_costs.py | 33 ------------------- 2 files changed, 47 deletions(-) delete mode 100644 lib/walden/ingests/irena/2022-10-07/renewable_power_generation_costs.meta.yml delete mode 100644 lib/walden/ingests/irena/2022-10-07/renewable_power_generation_costs.py diff --git a/lib/walden/ingests/irena/2022-10-07/renewable_power_generation_costs.meta.yml b/lib/walden/ingests/irena/2022-10-07/renewable_power_generation_costs.meta.yml deleted file mode 100644 index 4f9e5909ed9..00000000000 --- a/lib/walden/ingests/irena/2022-10-07/renewable_power_generation_costs.meta.yml +++ /dev/null @@ -1,14 +0,0 @@ -namespace: irena -short_name: renewable_power_generation_costs -name: Renewable Power Generation Costs -source_name: International Renewable Energy Agency (IRENA) -publication_year: 2022 -publication_date: 2022-07-01 -version: 2022-10-07 -url: https://irena.org/publications/2022/Jul/Renewable-Power-Generation-Costs-in-2021 -source_data_url: https://www.irena.org/-/media/Files/IRENA/Agency/Publication/2022/Jul/IRENA-Datafile-RenPwrGenCosts-in-2021-v1-0.xlsx -file_extension: xlsx -license_url: https://irena.org/-/media/Files/IRENA/Agency/Publication/2022/Jul/IRENA_Renewable_Power_Generation_Costs_2021.pdf -license_name: Copyright IRENA 2021 -description: | - The data in this link may be freely used, shared, copied, reproduced, printed and/or stored, provided that all such material is clearly 
attributed to IRENA and bears a notation of copyright (© IRENA) with the year of copyright. Unless otherwise stated, the data provided are the property of the International Renewable Energy Agency (IRENA) and are subject to copyright by IRENA. Data attributed to third parties may be subject to third-party copyright and separate terms of use and restrictions, including restrictions in relation to any commercial use. diff --git a/lib/walden/ingests/irena/2022-10-07/renewable_power_generation_costs.py b/lib/walden/ingests/irena/2022-10-07/renewable_power_generation_costs.py deleted file mode 100644 index b823dfc0c3c..00000000000 --- a/lib/walden/ingests/irena/2022-10-07/renewable_power_generation_costs.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Ingest IRENA's chart data from the Renewable Power Generation Costs.""" - -from pathlib import Path - -import click - -from owid.walden import Dataset - - -@click.command() -@click.option( - "--upload/--skip-upload", - default=True, - type=bool, - help="Upload dataset to Walden", -) -def main(upload: bool) -> None: - # Get metadata about this dataset from the adjacent yaml file. - metadata = Dataset.from_yaml(Path(__file__).parent / "renewable_power_generation_costs.meta.yml") - - # Download dataset from source_data_url and add the local file to walden's cache in: ~/.owid/walden - dataset = Dataset.download_and_create(metadata) - - # Upload file to S3. - if upload: - dataset.upload(public=True) - - # Create a walden index file. 
- dataset.save() - - -if __name__ == "__main__": - main() From d6ddfcac832d4eda3d18a39561da9522fa05c9cb Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 16:48:12 +0200 Subject: [PATCH 13/16] Improve metadata --- .../photovoltaic_cost_and_capacity.meta.yml | 69 ++++--------------- .../photovoltaic_cost_and_capacity.py | 4 +- 2 files changed, 15 insertions(+), 58 deletions(-) diff --git a/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.meta.yml b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.meta.yml index 43e3f753454..953e91cdde3 100644 --- a/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.meta.yml +++ b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.meta.yml @@ -1,57 +1,19 @@ -all_sources: - - nemet_2009: &source-nemet_2009 - name: G. G. Nemet (2009) - published_by: | - Interim monitoring of cost dynamics for publicly supported energy technologies. Energy Policy 37(3): 825-835. by Nemet, G. F. (2009). - url: https://www.sciencedirect.com/science/article/abs/pii/S0301421508005910 - date_accessed: '2023-01-04' - publication_date: '2009-03-01' - publication_year: 2009 - description: | - Photovoltaic cost and capacity data between 1975 and 2003 has been taken from Nemet (2009). +dataset: + title: Solar photovoltaic cost and capacity (Energy, 2023b) + description: | + Photovoltaic cost and capacity data between 1975 and 2003 has been taken from Nemet (2009). - Prices from Nemet (2009) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year - - farmer_lafond_2016: &source-farmer_lafond_2016 - name: J. D. Farmer & F. Lafond (2016) - published_by: | - How predictable is technological progress? J. D. Farmer & F. Lafond, Research Policy Volume 45, Issue 3, April 2016, Pages 647-665. 
- url: https://www.sciencedirect.com/science/article/pii/S0048733315001699 - date_accessed: '2023-01-04' - publication_date: '2016-04-01' - publication_year: 2016 - description: | - Photovoltaic cost data between 2004 and 2009 has been taken from Farmer & Lafond (2016). + Prices from Nemet (2009) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year - According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe Performance Curve DataBase, accessible at pcdb.santafe.edu. The database has been constructed from personal communications and from Colpier and Cornland (2002), Goldemberg et al. (2004), Lieberman (1984), Lipman and Sperling (1999), Zhao (1999), McDonald and Schrattenholzer (2001), Neij et al. (2003), Moore (2006), Nemet (2006), Schilling and Esmundo (2009). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from Koomey and Hultman (2007) and Cooper (2009). The DNA sequencing data is from Wetterstrand (2015) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. + Photovoltaic cost data between 2004 and 2009 has been taken from Farmer & Lafond (2016). - Prices from Farmer & Lafond (2016) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year - - irena_capacity: &source-irena_capacity - name: International Renewable Energy Agency (IRENA) - published_by: © 2022 by International Renewable Energy Agency (IRENA) - url: https://www.irena.org/Statistics/Download-query-tools - date_accessed: '2022-10-20' - publication_date: '2022-07-01' - publication_year: 2022 - description: | - Photovoltaic capacity data between 2004 and 2021 has been taken from IRENA. 
- - irena_costs: &source-irena_costs - name: International Renewable Energy Agency (IRENA) - published_by: International Renewable Energy Agency (IRENA) © 2022 by IRENA - url: https://irena.org/publications/2022/Jul/Renewable-Power-Generation-Costs-in-2021 - date_accessed: '2022-10-20' - publication_year: 2022 - description: | - Photovoltaic cost data between 2010 and 2021 has been taken from IRENA. + According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe Performance Curve DataBase, accessible at pcdb.santafe.edu. The database has been constructed from personal communications and from Colpier and Cornland (2002), Goldemberg et al. (2004), Lieberman (1984), Lipman and Sperling (1999), Zhao (1999), McDonald and Schrattenholzer (2001), Neij et al. (2003), Moore (2006), Nemet (2006), Schilling and Esmundo (2009). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from Koomey and Hultman (2007) and Cooper (2009). The DNA sequencing data is from Wetterstrand (2015) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. -dataset: - title: Solar photovoltaic cost and capacity (Energy, 2023b) - description: | - Prices from Nemet (2009) and from Farmer & LaFond (2016) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year - sources: - - *source-nemet_2009 - - *source-farmer_lafond_2016 - - *source-irena_capacity - - *source-irena_costs + Prices from Farmer & Lafond (2016) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year + + Photovoltaic capacity data between 2004 and 2021 has been taken from IRENA. + + Photovoltaic cost data between 2010 and 2021 has been taken from IRENA. 
tables: photovoltaic_cost_and_capacity: @@ -64,10 +26,6 @@ tables: Global average price of solar photovoltaic modules. IRENA presents solar PV module price series for a number of different module technologies. Here we have adopted the series for thin film a-Si/u-Si or Global Index (from Q4 2013). - sources: - - *source-nemet_2009 - - *source-farmer_lafond_2016 - - *source-irena_costs cost_source: title: Data source for cost data unit: '' @@ -78,9 +36,6 @@ tables: Global cumulative capacity of solar photovoltaics. short_unit: 'MW' unit: 'megawatts' - sources: - - *source-nemet_2009 - - *source-irena_capacity cumulative_capacity_source: title: Data source for cumulative capacity data unit: '' diff --git a/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py index 7aa89c4c711..c326396d007 100644 --- a/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py +++ b/etl/steps/data/garden/energy/2023-07-10/photovoltaic_cost_and_capacity.py @@ -161,5 +161,7 @@ def run(dest_dir: str) -> None: # Save outputs. # # Create a new dataset with the same metadata as meadow - ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined]) + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_combined], check_variables_metadata=True) + # NOTE: Currently, ETL fails if the dataset has no sources. Therefore, manually gather sources from all variables. 
+ ds_garden.metadata.sources = get_unique_sources_from_tables([tb_combined]) ds_garden.save() From d68bf2f573ca3887651320d9db173079a1c18848 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Jul 2023 17:00:09 +0200 Subject: [PATCH 14/16] Fix missing step in dag --- dag/archive/main.yml | 2 +- dag/main.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dag/archive/main.yml b/dag/archive/main.yml index 39a2d0e278e..b9d921885de 100644 --- a/dag/archive/main.yml +++ b/dag/archive/main.yml @@ -77,7 +77,7 @@ include: - dag/archive/energy.yml # - dag/archive/walkthrough.yml # - dag/archive/examples.yml - # - dag/archive/emissions.yml + - dag/archive/emissions.yml - dag/archive/papers.yml - dag/archive/demography.yml # - dag/archive/war.yml diff --git a/dag/main.yml b/dag/main.yml index 619e5a72c10..635481d0c29 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -213,7 +213,7 @@ steps: - data://garden/demography/2023-02-03/life_expectancy - data://garden/owid/latest/key_indicators - data://garden/regions/2023-01-01/regions - - data://garden/gcp/2023-04-28/global_carbon_budget + - data://garden/gcp/2023-07-10/global_carbon_budget - data://garden/democracy/2023-03-02/vdem - data://garden/energy/2023-07-10/energy_mix - data://garden/worldbank_wdi/2022-05-26/wdi From 80a198059c69726d964f23ff7a4a0f32d2b2973b Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 20 Jul 2023 12:44:30 +0200 Subject: [PATCH 15/16] Fix name of unused Statistical Review variable, for consistency --- .../2023-06-26/statistical_review_of_world_energy.meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/garden/energy_institute/2023-06-26/statistical_review_of_world_energy.meta.yml b/etl/steps/data/garden/energy_institute/2023-06-26/statistical_review_of_world_energy.meta.yml index fabe17ece93..c176235af66 100644 --- a/etl/steps/data/garden/energy_institute/2023-06-26/statistical_review_of_world_energy.meta.yml +++ 
b/etl/steps/data/garden/energy_institute/2023-06-26/statistical_review_of_world_energy.meta.yml @@ -248,7 +248,7 @@ tables: unit: 'thousand tonnes' short_unit: 'kt' lithium_reserves_kt: - title: 'Lithium Reserves' + title: 'Lithium reserves - kt' unit: 'thousand tonnes' short_unit: 'kt' nuclear_consumption_equivalent_ej: From 42f216447a2d9102f2751b9d08cc8dd0247c84d1 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 20 Jul 2023 14:34:47 +0200 Subject: [PATCH 16/16] Implement small metadata improvements as suggested by Fiona and Pablo A --- etl/steps/data/garden/emissions/2023-07-10/owid_co2.py | 4 ---- .../energy/2023-01-04/photovoltaic_cost_and_capacity.meta.yml | 2 +- .../data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml | 1 + .../data/garden/papers/2023-01-04/farmer_lafond_2016.meta.yml | 2 +- 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/etl/steps/data/garden/emissions/2023-07-10/owid_co2.py b/etl/steps/data/garden/emissions/2023-07-10/owid_co2.py index 52371f4a84d..72d0d831632 100644 --- a/etl/steps/data/garden/emissions/2023-07-10/owid_co2.py +++ b/etl/steps/data/garden/emissions/2023-07-10/owid_co2.py @@ -23,10 +23,6 @@ # Get paths and naming conventions for current step. paths = PathFinder(__file__) -# Details for dataset to export. -DATASET_SHORT_NAME = "owid_co2" -DATASET_TITLE = "CO2 dataset (OWID, 2022)" - # Conversion factor from tonnes to million tonnes. TONNES_TO_MILLION_TONNES = 1e-6 diff --git a/etl/steps/data/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.meta.yml b/etl/steps/data/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.meta.yml index 5a39f64da72..7da43f4504d 100644 --- a/etl/steps/data/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.meta.yml +++ b/etl/steps/data/garden/energy/2023-01-04/photovoltaic_cost_and_capacity.meta.yml @@ -22,7 +22,7 @@ all_sources: description: | Photovoltaic cost data between 2004 and 2009 has been taken from Farmer & Lafond (2016). 
- According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe Performance Curve DataBase, accessible at pcdb.santafe.edu. The database has been constructed from personal communications and from Colpier and Cornland (2002), Goldemberg et al. (2004), Lieberman (1984), Lipman and Sperling (1999), Zhao (1999), McDonald and Schrattenholzer (2001), Neij et al. (2003), Moore (2006), Nemet (2006), Schilling and Esmundo (2009). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from Koomey and Hultman (2007) and Cooper (2009). The DNA sequencing data is from Wetterstrand (2015) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. + According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe Performance Curve Database. The database has been constructed from personal communications and from Colpier and Cornland (2002), Goldemberg et al. (2004), Lieberman (1984), Lipman and Sperling (1999), Zhao (1999), McDonald and Schrattenholzer (2001), Neij et al. (2003), Moore (2006), Nemet (2006), Schilling and Esmundo (2009). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from Koomey and Hultman (2007) and Cooper (2009). The DNA sequencing data is from Wetterstrand (2015) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. 
Prices from Farmer & Lafond (2016) have been converted to 2021 US$ using the US GDP deflator: https://www.multpl.com/gdp-deflator/table/by-year - irena_capacity: &source-irena_capacity diff --git a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml index 1cfb103d802..9c70ad34f54 100644 --- a/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml +++ b/etl/steps/data/garden/gcp/2023-07-10/global_carbon_budget.meta.yml @@ -17,6 +17,7 @@ dataset: Data on global emissions has been converted by Our World in Data from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664. Our World in Data have renamed bunker fuels as "International transport" for improved clarity, which includes emissions from international aviation and shipping. + Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year. tables: diff --git a/etl/steps/data/garden/papers/2023-01-04/farmer_lafond_2016.meta.yml b/etl/steps/data/garden/papers/2023-01-04/farmer_lafond_2016.meta.yml index 0f861c6c5ed..a6083a834aa 100644 --- a/etl/steps/data/garden/papers/2023-01-04/farmer_lafond_2016.meta.yml +++ b/etl/steps/data/garden/papers/2023-01-04/farmer_lafond_2016.meta.yml @@ -71,7 +71,7 @@ dataset: + Vinyl chloride is measured in 1966 USD/lbs. + Wind turbine (Denmark) is measured in DKK/kW. - According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe Performance Curve DataBase, accessible at pcdb.santafe.edu. The database has been constructed from personal communications and from Colpier and Cornland (2002), Goldemberg et al. (2004), Lieberman (1984), Lipman and Sperling (1999), Zhao (1999), McDonald and Schrattenholzer (2001), Neij et al. (2003), Moore (2006), Nemet (2006), Schilling and Esmundo (2009). 
The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from Koomey and Hultman (2007) and Cooper (2009). The DNA sequencing data is from Wetterstrand (2015) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. + According to Farmer & Lafond (2016), the data are mostly taken from the Santa-Fe Performance Curve Database. The database has been constructed from personal communications and from Colpier and Cornland (2002), Goldemberg et al. (2004), Lieberman (1984), Lipman and Sperling (1999), Zhao (1999), McDonald and Schrattenholzer (2001), Neij et al. (2003), Moore (2006), Nemet (2006), Schilling and Esmundo (2009). The data on photovoltaic prices has been collected from public releases of Strategies Unlimited, Navigant and SPV Market Research. The data on nuclear energy is from Koomey and Hultman (2007) and Cooper (2009). The DNA sequencing data is from Wetterstrand (2015) (cost per human-size genome), and for each year the last available month (September for 2001-2002 and October afterwards) was taken and corrected for inflation using the US GDP deflator. licenses: - name: Creative Commons 4.0 url: https://www.sciencedirect.com/science/article/pii/S0048733315001699