diff --git a/dag/education.yml b/dag/education.yml index 781f94d2b82..8742970c935 100644 --- a/dag/education.yml +++ b/dag/education.yml @@ -119,3 +119,11 @@ steps: - snapshot://wb/2024-06-18/edstats_metadata.xls data://grapher/unesco/2024-06-25/education_sdgs: - data://garden/unesco/2024-06-25/education_sdgs + + # World Bank EdStats + data://meadow/wb/2024-11-04/edstats: + - snapshot://wb/2024-11-04/edstats.csv + data://garden/wb/2024-11-04/edstats: + - data://meadow/wb/2024-11-04/edstats + data://grapher/wb/2024-11-04/edstats: + - data://garden/wb/2024-11-04/edstats diff --git a/etl/steps/data/garden/wb/2024-11-04/edstats.countries.json b/etl/steps/data/garden/wb/2024-11-04/edstats.countries.json new file mode 100644 index 00000000000..1b7f234cefd --- /dev/null +++ b/etl/steps/data/garden/wb/2024-11-04/edstats.countries.json @@ -0,0 +1,222 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas, The": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cayman Islands": "Cayman Islands", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Channel 
Islands": "Channel Islands", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Costa Rica": "Costa Rica", + "Cote d'Ivoire": "Cote d'Ivoire", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Curacao": "Curacao", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Faroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Gambia, The": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong SAR, China": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyz Republic": "Kyrgyzstan", + "Lao PDR": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macao SAR, China": "Macao", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + 
"Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Moldova": "Moldova", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Northern Mariana Islands": "Northern Mariana Islands", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovak Republic": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and 
Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela, RB": "Venezuela", + "Viet Nam": "Vietnam", + "Vietnam": "Vietnam", + "West Bank and Gaza": "Palestine", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Congo, Dem Rep": "Democratic Republic of Congo", + "Congo, Rep": "Congo", + "Egypt, Arab Rep": "Egypt", + "Iran, Islamic Rep": "Iran", + "Korea, Dem People\u2019s Rep": "North Korea", + "Korea, Rep": "South Korea", + "Micronesia, Fed Sts": "Micronesia (country)", + "St Kitts and Nevis": "Saint Kitts and Nevis", + "St Lucia": "Saint Lucia", + "St Martin (French part)": "Saint Martin (French part)", + "St Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Turkiye": "Turkey", + "Virgin Islands (US)": "United States Virgin Islands", + "Yemen, Rep": "Yemen" +} \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-11-04/edstats.meta.yml b/etl/steps/data/garden/wb/2024-11-04/edstats.meta.yml new file mode 100644 index 00000000000..000557ef57c --- /dev/null +++ b/etl/steps/data/garden/wb/2024-11-04/edstats.meta.yml @@ -0,0 +1,55 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Global Education
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 364
+
+
+tables:
+  edstats:
+    variables:
+      # indicator_name, source_note and unit_measure are index dimensions set in the
+      # garden step; the templates below branch on their values. Dimension values are
+      # compared as quoted strings. Any value not listed (including a missing
+      # unit_measure) matches no branch, so the unit renders empty.
+      value:
+        title: << indicator_name >>
+
+        description_from_producer: |-
+          <% if source_note != 'nan' %>
+          << source_note >>
+          <%- endif -%>
+        unit: |-
+          <% if unit_measure == 'NUMBER' %>
+          number
+          <% elif unit_measure == 'SHARE' %>
+          %
+          <% elif unit_measure == 'USD' %>
+          US dollars
+          <% elif unit_measure == 'IX_0T1' %>
+          index
+          <% elif unit_measure == 'USD_CONST' %>
+          constant US dollars
+          <% elif unit_measure == 'YR' %>
+          years
+          <%- endif -%>
+        short_unit: |-
+          <% if unit_measure == 'SHARE' %>
+          %
+          <% elif unit_measure == 'USD' %>
+          $
+          <% elif unit_measure == 'USD_CONST' %>
+          constant $
+          <%- endif -%>
+
+
+
+
+
diff --git a/etl/steps/data/garden/wb/2024-11-04/edstats.py b/etl/steps/data/garden/wb/2024-11-04/edstats.py
new file mode 100644
index 00000000000..97e1f5d84ec
--- /dev/null
+++ b/etl/steps/data/garden/wb/2024-11-04/edstats.py
@@ -0,0 +1,48 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    """Build the garden EdStats dataset from the meadow one.
+
+    Country names are harmonized with the step's country mapping file, and the table is
+    indexed by country, year, indicator_name, source_note and unit_measure.
+
+    Args:
+        dest_dir: Destination passed through to `create_dataset`.
+    """
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("edstats")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["edstats"].reset_index()
+
+    #
+    # Process data.
+    #
+    tb = geo.harmonize_countries(
+        df=tb,
+        countries_file=paths.country_mapping_path,
+    )
+
+    # Keep source_note and unit_measure in the index: the metadata templates read them.
+    tb = tb.format(["country", "year", "indicator_name", "source_note", "unit_measure"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/wb/2024-11-04/edstats.py b/etl/steps/data/grapher/wb/2024-11-04/edstats.py
new file mode 100644
index 00000000000..4ff01c79b9a
--- /dev/null
+++ b/etl/steps/data/grapher/wb/2024-11-04/edstats.py
@@ -0,0 +1,33 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    """Publish the garden EdStats table, unchanged, as a grapher dataset.
+
+    Args:
+        dest_dir: Destination passed through to `create_dataset`.
+    """
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("edstats")
+
+    # Read table from garden dataset.
+    tb = ds_garden["edstats"]
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/wb/2024-11-04/edstats.py b/etl/steps/data/meadow/wb/2024-11-04/edstats.py
new file mode 100644
index 00000000000..77bd5bce763
--- /dev/null
+++ b/etl/steps/data/meadow/wb/2024-11-04/edstats.py
@@ -0,0 +1,76 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    """Reshape the EdStats snapshot from wide (one column per year) to long format.
+
+    Drops the columns listed in `columns_to_drop`, melts every `YR*` column into `year` and
+    `value`, renames "Country name" to "country", discards rows without a country, and
+    indexes the table by country, year and indicator_name.
+
+    Args:
+        dest_dir: Destination passed through to `create_dataset`.
+    """
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("edstats.csv")
+
+    # Load data from snapshot.
+    tb = snap.read(low_memory=False)
+
+    #
+    # Process data.
+    #
+    # Remove redundant columns (SEX, URBANIZATION, AGE, COMP_BREAKDOWN_1, INDICATOR_ROOT, INDICATOR_ROOT_NAME, economy are already within the indicator_name)
+    columns_to_drop = [
+        "unit",
+        "name",
+        "SEX",
+        "URBANIZATION",
+        "AGE",
+        "COMP_BREAKDOWN_1",
+        "INDICATOR_ROOT",
+        "INDICATOR_ROOT_NAME",
+        "economy",
+        "UNIT_TYPE",
+        "source",
+        "INDICATOR",
+    ]
+
+    tb = tb.drop(columns=columns_to_drop)
+
+    # Identify columns that start with 'YR'
+    year_columns = [col for col in tb.columns if col.startswith("YR")]
+
+    # Melt the DataFrame to create a 'year' column
+    tb = tb.melt(
+        id_vars=[col for col in tb.columns if col not in year_columns],
+        value_vars=year_columns,
+        var_name="year",
+        value_name="value",
+    )
+
+    # Remove 'YR' prefix from the 'year' column
+    tb["year"] = tb["year"].str.replace("YR", "").astype(int)
+
+    tb = tb.rename({"Country name": "country"}, axis=1)
+    # Drop rows where 'country' is NaN
+    tb = tb.dropna(subset=["country"])
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb = tb.format(["country", "year", "indicator_name"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()