From 7369b0c20f5edc1e4dc465d63fe30e113430718b Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 Date: Mon, 24 Jul 2023 14:39:20 +0200 Subject: [PATCH] small AI fixes + CSET dataset on investment --- dag/artificial_intelligence.yml | 8 ++ dag/walkthrough.yml | 2 +- .../2023-06-14/ai_national_strategy.py | 2 +- .../2023-06-14/ai_robots.meta.yml | 12 +- .../2023-06-14/ai_robots.py | 1 + .../2023-06-26/ai_wrp_2021.meta.yml | 40 +++++- .../2023-06-26/ai_wrp_2021.py | 116 +++++++++++++----- .../2023-06-26/ai_wrp_2021_grouped.meta.yml | 44 ++++++- .../2023-06-26/ai_wrp_2021_grouped.py | 92 ++++++++------ .../2023-07-12/epoch_llms.meta.yml | 10 +- .../2023-07-12/epoch_llms.py | 24 ---- .../2023-07-23/cset.countries.json | 2 + .../2023-07-23/cset.excluded_countries.json | 2 + .../2023-07-23/cset.meta.yml | 35 ++++++ .../2023-07-23/cset.py | 38 ++++++ .../2023-07-23/cset.py | 39 ++++++ .../2023-07-23/cset.py | 38 ++++++ .../2023-07-12/epoch_llms.csv.dvc | 13 +- .../2023-07-23/cset.csv.dvc | 28 +++++ .../2023-07-23/cset.py | 60 +++++++++ 20 files changed, 492 insertions(+), 114 deletions(-) create mode 100644 etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.countries.json create mode 100644 etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.excluded_countries.json create mode 100644 etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.meta.yml create mode 100644 etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2023-07-23/cset.py create mode 100644 etl/steps/data/meadow/artificial_intelligence/2023-07-23/cset.py create mode 100644 snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc create mode 100644 snapshots/artificial_intelligence/2023-07-23/cset.py diff --git a/dag/artificial_intelligence.yml b/dag/artificial_intelligence.yml index 930b933847b..3a8cb2a4fa5 100644 --- a/dag/artificial_intelligence.yml +++ b/dag/artificial_intelligence.yml @@ -205,3 +205,11 @@ steps: - data://meadow/artificial_intelligence/2023-07-07/semiconductors_cset data://grapher/artificial_intelligence/2023-07-07/semiconductors_cset: - data://garden/artificial_intelligence/2023-07-07/semiconductors_cset + +# CSET data on patents, articles and private investment + data://meadow/artificial_intelligence/2023-07-23/cset: + - snapshot://artificial_intelligence/2023-07-23/cset.csv + data://garden/artificial_intelligence/2023-07-23/cset: + - data://meadow/artificial_intelligence/2023-07-23/cset + data://grapher/artificial_intelligence/2023-07-23/cset: + - data://garden/artificial_intelligence/2023-07-23/cset diff --git a/dag/walkthrough.yml b/dag/walkthrough.yml index 75d8204ec07..413e39c0dbe 100644 --- a/dag/walkthrough.yml +++ b/dag/walkthrough.yml @@ -7,4 +7,4 @@ steps: data://grapher/dummy/2020-01-01/dummy: - data://garden/dummy/2020-01-01/dummy data://explorers/dummy/2020-01-01/dummy: - - data://garden/dummy/2020-01-01/dummy \ No newline at end of file + - data://garden/dummy/2020-01-01/dummy diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py index baebc788785..9fa62b488c5 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py @@ -61,7 +61,7 @@ def run(dest_dir: str) -> None: group["released_national_strategy_on_ai"].fillna(method="ffill", inplace=True) # Fill 
remaining NaN values with "Not Released"
-    group["released_national_strategy_on_ai"].fillna("Not Released", inplace=True)
+    group["released_national_strategy_on_ai"].fillna("Not released", inplace=True)
     df_merged.loc[group.index] = group
     df_merged.drop("released", axis=1, inplace=True)
     tb = Table(df_merged, short_name=paths.short_name, underscore=True)
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.meta.yml
index ee27a599c35..61bf72e4287 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.meta.yml
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.meta.yml
@@ -184,6 +184,7 @@ tables:
       unit: 'robots'
       description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
       display:
+        name: Total industrial robots in operation
         numDecimalPlaces: 0

     number_of_industrial_robots_installed_2021:
@@ -191,7 +192,7 @@ tables:
       unit: 'robots'
       description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
       display:
-        name: Number of industrial robots installed
+        name: Total industrial robots installed
         numDecimalPlaces: 0

     annual_count__number_of_industrial_robots_installed:
@@ -199,7 +200,7 @@ tables:
       unit: 'robots'
       description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
       display:
-        name: Number of industrial robots installed
+        name: Annual industrial robots installed
         numDecimalPlaces: 0

     new_robots_installed__number_of_industrial_robots_installed:
@@ -208,4 +209,11 @@ tables:
       description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
       display:
         name: Number of industrial robots installed
+        numDecimalPlaces: 0
+    unspecified_others:
+      title: Unspecified or Other Sector
+      unit: 'robots'
+      description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
+      display:
+        name: Unspecified or other
         numDecimalPlaces: 0
\ No newline at end of file
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.py
index d09cb646c39..725b4e62e5c 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.py
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.py
@@ -101,6 +101,7 @@ def run(dest_dir: str) -> None:

     # Merge pivot table for professional service robots, application area and sector with aggregates
     merge_all = pd.merge(merge_service, df_agg_clean, on=["year", "country"], how="outer")
+    merge_all["unspecified_others"] = merge_all["Unspecified Sector"] + merge_all["All others"]

     # Set the index as 'country' and 'year'
     merge_all.set_index(["country", "year"], inplace=True)
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.meta.yml
index 38694c3ec5a..636f25e83fb 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.meta.yml
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.meta.yml
@@ -66,9 +66,18 @@ tables:
       display:
         numDecimalPlaces: 0

+    dk_no_op:
+      title: "Don't have an opinion or Don't know - help/harm question"
+      description: Share of respondents who said "Don't have an opinion" or "Don't know" to the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No opinion or don't know"
+        numDecimalPlaces: 0
+
     other_help_harm:
       title: "Don't have an opinion, Don't know, or Refused - help/harm question"
-      description: Share of respondents who refused to answer the question or said "Don't have an opinion" to the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      description: Share of respondents who said "Don't have an opinion" or "Don't know", or refused to answer the question "Will Artificial Intelligence help or harm people in the next 20 years?".
       unit: '%'
       short_unit: '%'
       display:
@@ -83,3 +92,30 @@ tables:
       display:
         name: "Other"
         numDecimalPlaces: 0
+
+    refused__help_harm:
+      title: "Refused - help/harm question"
+      description: Share of respondents who refused to answer the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No response"
+        numDecimalPlaces: 0
+
+    dk__cars:
+      title: "Don't know - self-driving cars question"
+      description: Share of respondents who said "Don't know" to the question "Would you feel safe in a car driven by a computer without a human driver?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "Don't know"
+        numDecimalPlaces: 0
+
+    refused__cars:
+      title: "Refused - self-driving cars question"
+      description: Share of respondents who refused to answer the question "Would you feel safe in a car driven by a computer without a human driver?".
+ unit: '%' + short_unit: '%' + display: + name: "No response" + numDecimalPlaces: 0 \ No newline at end of file diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py index 30d5c95a33b..bfe7cd4e1c7 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py @@ -2,6 +2,7 @@ from typing import cast +import numpy as np import pandas as pd from owid.catalog import Dataset, Table from structlog import get_logger @@ -57,10 +58,17 @@ def run(dest_dir: str) -> None: columns_to_split_by = ["country", "gender", "education", "income_5", "emp_2010", "agegroups4", "globalregion"] # Dictionary to map response codes to labels for question 9 - dict_q9 = {1: "Mostly help", 2: "Mostly harm", 3: "Don't have an opinion", 4: "Neither", 98: "DK", 99: "Refused"} + dict_q9 = { + 1: "Mostly help", + 2: "Mostly harm", + 3: "Don't have an opinion", + 4: "Neither", + 98: "DK(help/harm)", + 99: "Refused(help/harm)", + } # Dictionary to map response codes to labels for question 8 - dict_q8 = {1: "Yes, would feel safe", 2: "No, would not feel safe", 98: "DK", 99: "Refused"} + dict_q8 = {1: "Yes, would feel safe", 2: "No, would not feel safe", 98: "DK(cars)", 99: "Refused(cars)"} # Create a list of DataFrames for each column_to_split_by for question 8 df_q8_list = [] @@ -95,43 +103,28 @@ def run(dest_dir: str) -> None: "No, would not feel safe", "Mostly harm", "Neither", + "DK(help/harm)", + "Refused(help/harm)", + "DK(cars)", + "Refused(cars)", + "Don't have an opinion", ] ] .dropna(subset=["country"]) .copy() ) - # Select rows with categories (NaN country rows) - world_df = df_merge[df_merge["country"].isna()].copy() - world_df.reset_index(drop=True, inplace=True) - # Set country as World - world_df["country"] = world_df["country"].astype(str) - world_df.loc[world_df["country"] == "nan", "country"] = "World" - # Calculates the percentage of valid responses for the "Mostly help" column in a DataFrame, split by gender, income etc. - conc_df_help = pivot_by_category(world_df, "Mostly help") - conc_df_harm = pivot_by_category(world_df, "Mostly harm") - conc_df_neither = pivot_by_category(world_df, "Neither") + merge_rest = calculate_world_data(df_merge, df_without_categories) - merge_help_harm = pd.merge(conc_df_help, conc_df_harm, on=["year", "country"], how="outer") - merge_help_harm_neither = pd.merge(merge_help_harm, conc_df_neither, on=["year", "country"], how="outer") - - # Calculates the percentage of valid responses for a "Yes, would feel safe column in a DataFrame, split by gender, income etc. 
-    conc_df_yes = pivot_by_category(world_df, "Yes, would feel safe")
-    conc_df_no = pivot_by_category(world_df, "No, would not feel safe")
-    merge_yes_no = pd.merge(conc_df_yes, conc_df_no, on=["year", "country"], how="outer")
+    tb = Table(merge_rest, short_name=paths.short_name, underscore=True)

-    # Merge all dataframes into one
-    merge_categorized = pd.merge(merge_help_harm_neither, merge_yes_no, on=["year", "country"], how="outer")
-    merge_rest = pd.merge(df_without_categories, merge_categorized, on=["year", "country"], how="outer")
-    merge_rest["other_yes_no"] = 100 - (merge_rest["Yes, would feel safe"] + merge_rest["No, would not feel safe"])
-    merge_rest["other_help_harm"] = 100 - (
-        merge_rest["Mostly help"] + merge_rest["Mostly harm"] + merge_rest["Neither"]
+    tb["dk_no_op"] = tb[["dk__help_harm", "dont_have_an_opinion"]].sum(axis=1).values
+    tb["other_help_harm"] = tb[["dk__help_harm", "dont_have_an_opinion", "refused__help_harm"]].sum(axis=1).values
+    tb["other_yes_no"] = tb[["dk__cars", "refused__cars"]].sum(axis=1).values
+    tb[["dk_no_op", "other_help_harm", "other_yes_no"]] = tb[["dk_no_op", "other_help_harm", "other_yes_no"]].replace(
+        0.0, np.NaN
     )
-    merge_rest.set_index(["year", "country"], inplace=True)
-
-    tb = Table(merge_rest, short_name=paths.short_name, underscore=True)
-
     #
     # Save outputs.
     #
@@ -150,7 +143,7 @@ def calculate_percentage(df, column, valid_responses_dict, column_to_split_by):
     Args:
         df (DataFrame): The input DataFrame.
         column (str): The column name to calculate the percentage.
         valid_responses_dict (dict): A dictionary mapping valid response codes to their corresponding labels.
         column_to_split_by (str): The column name to split by.
     Returns:
         DataFrame: A DataFrame with columns: the column_to_split_by, "year", "column", "count", and "percentage".
@@ -210,9 +204,67 @@ def question_extract(q, df, column_to_split_by, dict_q):
     pivoted_df.columns.name = None

     if q == "q9":
-        return pivoted_df[["year", column_to_split_by, "Mostly help", "Mostly harm", "Neither"]]
+        return pivoted_df[
+            [
+                "year",
+                column_to_split_by,
+                "Mostly help",
+                "Mostly harm",
+                "Neither",
+                "Don't have an opinion",
+                "DK(help/harm)",
+                "Refused(help/harm)",
+            ]
+        ]
     else:
-        return pivoted_df[["year", column_to_split_by, "Yes, would feel safe", "No, would not feel safe"]]
+        return pivoted_df[
+            ["year", column_to_split_by, "Yes, would feel safe", "No, would not feel safe", "DK(cars)", "Refused(cars)"]
+        ]
+
+
+def calculate_world_data(df_merge, df_without_categories):
+    # Select rows with categories (NaN country rows)
+    world_df = df_merge[df_merge["country"].isna()].copy()
+    world_df.reset_index(drop=True, inplace=True)
+
+    # Set country as World
+    world_df["country"] = world_df["country"].astype(str)
+    world_df.loc[world_df["country"] == "nan", "country"] = "World"
+
+    # Calculate the percentage of valid responses for "Mostly help", "Mostly harm", "Neither" in a DataFrame,
+    # split by gender, income etc.
+    columns_to_calculate = [
+        "Mostly help",
+        "Mostly harm",
+        "Neither",
+        "DK(help/harm)",
+        "Don't have an opinion",
+        "Refused(help/harm)",
+    ]
+    merge_help_harm_all = None
+    for column in columns_to_calculate:
+        conc_df = pivot_by_category(world_df, column)
+        if merge_help_harm_all is None:
+            merge_help_harm_all = conc_df
+        else:
+            merge_help_harm_all = pd.merge(merge_help_harm_all, conc_df, on=["year", "country"], how="outer")
+
+    # Calculate the percentage of valid responses for "Yes, would feel safe" in a DataFrame, split by gender, income etc.
+    columns_to_calculate = ["Yes, would feel safe", "No, would not feel safe", "DK(cars)", "Refused(cars)"]
+    merge_yes_no = None
+    for column in columns_to_calculate:
+        conc_df = pivot_by_category(world_df, column)
+        if merge_yes_no is None:
+            merge_yes_no = conc_df
+        else:
+            merge_yes_no = pd.merge(merge_yes_no, conc_df, on=["year", "country"], how="outer")
+
+    # Merge all dataframes into one
+    merge_categorized = pd.merge(merge_help_harm_all, merge_yes_no, on=["year", "country"], how="outer")
+    merge_rest = pd.merge(df_without_categories, merge_categorized, on=["year", "country"], how="outer")
+
+    merge_rest.set_index(["year", "country"], inplace=True)
+    return merge_rest


 def map_values(df):
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.meta.yml
index a21266e7436..38f59eadcc9 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.meta.yml
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.meta.yml
@@ -66,16 +66,25 @@ tables:
       display:
         numDecimalPlaces: 0

-    other_help_harm:
-      title: "Don't have an opinion, Don't know, or Refused - help/harm question"
-      description: Share of respondents who refused to answer the question or said "Don't have an opinion" to the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+    dk_no_op_value:
+      title: "Don't have an opinion or Don't know - help/harm question"
+      description: Share of respondents who said "Don't have an opinion" or "Don't know" to the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No opinion or don't know"
+        numDecimalPlaces: 0
+
+    other_help_harm_value:
+      title: "Don't have an opinion, Don't know, or Refused - help/harm question"
+      description: Share of respondents who said "Don't have an opinion" or "Don't know", or refused to answer the question "Will Artificial Intelligence help or harm people in the next 20 years?".
       unit: '%'
       short_unit: '%'
       display:
         name: "Other"
         numDecimalPlaces: 0

-    other_yes_no:
+    other_yes_no_value:
       title: "Don't know or Refused - self-driving cars question"
       description: Share of respondents who refused to answer the question or said "Don't know" to the question "Would you feel safe in a car driven by a computer without a human driver?".
       unit: '%'
@@ -83,3 +92,30 @@ tables:
       display:
         name: "Other"
         numDecimalPlaces: 0
+
+    refused__help_harm_value:
+      title: "Refused - help/harm question"
+      description: Share of respondents who refused to answer the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No response"
+        numDecimalPlaces: 0
+
+    dk__cars_value:
+      title: "Don't know - self-driving cars question"
+      description: Share of respondents who said "Don't know" to the question "Would you feel safe in a car driven by a computer without a human driver?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "Don't know"
+        numDecimalPlaces: 0
+
+    refused__cars_value:
+      title: "Refused - self-driving cars question"
+      description: Share of respondents who refused to answer the question "Would you feel safe in a car driven by a computer without a human driver?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No response"
+        numDecimalPlaces: 0
\ No newline at end of file
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.py b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.py
index 433fd931737..978eeb8d97e 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.py
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.py
@@ -1,6 +1,6 @@
 """Load a meadow dataset and create a garden dataset."""

-from typing import cast
+from typing import List, cast

 import pandas as pd
 from owid.catalog import Dataset, Table
@@ -14,32 +14,20 @@
 paths = PathFinder(__file__)


-# Function to melt and clean dataframe based on column name
-def melt_and_clean(df, col_name):
-    excluded_columns = [
-        "yes__would_feel_safe",
-        "mostly_help",
-        "no__would_not_feel_safe",
-        "mostly_harm",
-        "other_help_harm",
-        "other_yes_no",
-        "neither",
-    ]
-
+def melt_and_clean(df: pd.DataFrame, col_name: str, excluded_columns: List[str]) -> pd.DataFrame:
+    """
+    Melt and clean dataframe based on column name.
+    """
     melted_df = pd.melt(
         df.reset_index(),
         id_vars=["year", "country"],
         value_vars=[col for col in df.columns if col_name in col and col not in excluded_columns],
     )

-    melted_df[col_name] = (
+    melted_df["group"] = (
         melted_df["variable"].str.split("_" + col_name, expand=True)[0].str.replace("_", " ").str.title()
     )
     melted_df.rename(columns={"value": f"{col_name}_value"}, inplace=True)
-    melted_df.rename(columns={col_name: "group"}, inplace=True)
-
     melted_df = melted_df[melted_df[f"{col_name}_value"].notnull()]
-    melted_df.reset_index(drop=True, inplace=True)
-
     return melted_df[["year", f"{col_name}_value", "group"]]


@@ -48,28 +36,58 @@ def run(dest_dir: str) -> None:
     # Load garden dataset.
     ds_garden = cast(Dataset, paths.load_dependency("ai_wrp_2021"))
-
-    # Read table from meadow dataset.
df = pd.DataFrame(ds_garden["ai_wrp_2021"]) - # Melt and clean dataframes - melted_yes = melt_and_clean(df, "yes__would_feel_safe").dropna(subset=["yes__would_feel_safe_value"]) - melted_no = melt_and_clean(df, "no__would_not_feel_safe").dropna(subset=["no__would_not_feel_safe_value"]) - merge_yes_no = pd.merge(melted_yes, melted_no, on=["year", "group"], how="outer") - melted_help = melt_and_clean(df, "mostly_help").dropna(subset=["mostly_help_value"]) - melted_harm = melt_and_clean(df, "mostly_harm").dropna(subset=["mostly_harm_value"]) - melted_neither = melt_and_clean(df, "neither").dropna(subset=["neither_value"]) - merge_help_harm = pd.merge(melted_help, melted_harm, on=["year", "group"], how="outer") - merge_help_harm_neither = pd.merge(merge_help_harm, melted_neither, on=["year", "group"], how="outer") - merge_all = pd.merge(merge_yes_no, merge_help_harm_neither, on=["year", "group"], how="outer") - - merge_all["other_help_harm"] = 100 - ( - merge_all["mostly_help_value"] + merge_all["mostly_harm_value"] + merge_all["neither_value"] - ) + columns_to_melt = [ + "yes__would_feel_safe", + "no__would_not_feel_safe", + "dk__cars", + "refused__cars", + "mostly_help", + "mostly_harm", + "neither", + "dk__help_harm", + "dont_have_an_opinion", + "refused__help_harm", + ] + + # Define a common list of excluded columns. + excluded_columns = [ + "yes__would_feel_safe", + "mostly_help", + "no__would_not_feel_safe", + "mostly_harm", + "other_yes_no", + "other_help_harm", + "neither", + "refused__cars", + "dk__cars", + "refused__help_harm", + "dk_no_op", + "dk__help_harm", + "dont_have_an_opinion", + ] + + # Using a dictionary to store the melted dataframes. + melted_dfs = {} + + for column in columns_to_melt: + melted_dfs[column] = melt_and_clean(df, column, excluded_columns) - merge_all["other_yes_no"] = 100 - ( - merge_all["yes__would_feel_safe_value"] + merge_all["no__would_not_feel_safe_value"] + merge_all = melted_dfs[columns_to_melt[0]] + + # Merge all melted dataframes together. + for column in columns_to_melt[1:]: + merge_all = pd.merge(merge_all, melted_dfs[column], on=["year", "group"], how="outer") + + # Derive additional columns (mainly to avoid grapher errors) + merge_all["other_yes_no_value"] = merge_all["dk__cars_value"] + merge_all["refused__cars_value"] + merge_all["other_help_harm_value"] = ( + merge_all["dk__help_harm_value"] + + merge_all["dont_have_an_opinion_value"] + + merge_all["refused__help_harm_value"] ) + merge_all["dk_no_op_value"] = merge_all["dk__help_harm_value"] + merge_all["dont_have_an_opinion_value"] # Rename group values group_replacements = { @@ -92,8 +110,8 @@ def run(dest_dir: str) -> None: "Employed Part Time Do Not Want Full Time": "Employed Part-Time (Not seeking Full-Time)", "Employed Part Time Want Full Time": "Employed Part-Time (Seeking Full-Time)", } - merge_all["group"].replace(group_replacements, inplace=True) + merge_all["group"].replace(group_replacements, inplace=True) merge_all.set_index(["year", "group"], inplace=True) # Create a new garden dataset with the same metadata as the meadow dataset. 
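For readers unfamiliar with the melt-and-merge pattern used in ai_wrp_2021_grouped.py above: melt_and_clean() reshapes wide per-group columns (e.g. male_mostly_help, female_mostly_help) into long rows keyed by a "group" label, and run() then outer-merges one long frame per response category on ["year", "group"]. A minimal, self-contained sketch of that reshaping on a toy frame; the column names and values here are invented for illustration and are not the real dataset schema:

import pandas as pd

# Toy wide frame: one column per (group, response) pair.
df = pd.DataFrame(
    {
        "year": [2021],
        "country": ["World"],
        "male_mostly_help": [55.0],
        "female_mostly_help": [48.0],
    }
)

melted = pd.melt(
    df,
    id_vars=["year", "country"],
    value_vars=[c for c in df.columns if "mostly_help" in c],
)
# Recover the group label from the column prefix: "male_mostly_help" -> "Male".
melted["group"] = melted["variable"].str.split("_mostly_help", expand=True)[0].str.replace("_", " ").str.title()
melted = melted.rename(columns={"value": "mostly_help_value"})
print(melted[["year", "mostly_help_value", "group"]])
#    year  mostly_help_value   group
# 0  2021               55.0    Male
# 1  2021               48.0  Female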
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.meta.yml
index 41c6a5320ed..98abc0db8e2 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.meta.yml
+++ b/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.meta.yml
@@ -1,12 +1,12 @@
 dataset:
-  title: Large Language Model performance and compute, EPOCH (2023)
+  title: Large Language Model performance and compute, Epoch (2023)
   description: Epoch dataset on how performance on an MMLU language benchmark scales with computational resources.
   licenses:
   - name: Creative Commons BY 4.0
   sources:
-  - name: EPOCH (2023)
-    description: EPOCH dataset on how performance on a MMLU language benchmark scales with computational resources.
-    url: provided directly by the source
+  - name: Epoch (2023)
+    description: Epoch dataset on how performance on an MMLU language benchmark scales with computational resources.
+    url: https://docs.google.com/spreadsheets/d/1HSGbUVwGy3XLuChH_H16Keux2jmVfKT9rfDrC3uu-SQ/edit?usp=sharing
     date_accessed: '2023-07-12'
     publication_date: '2023-07-12'
     publication_year: 2023
@@ -71,5 +71,5 @@ tables:
       The training computation used can vary depending on factors such as the size of the dataset, size and complexity of the system architecture, and the level of parallelism used during training, among other reasons.
     display:
-      title: Training compute
+      title: Training computation (petaFLOP)
       numDecimalPlaces: 0
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.py b/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.py
index 698d1aa4efa..47684ced76d 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.py
+++ b/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.py
@@ -19,7 +19,6 @@ def run(dest_dir: str) -> None:
     # Load meadow dataset.
     snap = cast(Snapshot, paths.load_dependency("epoch_llms.csv"))
     df = pd.read_csv(snap.path)
-    df["Architecture"] = df.apply(add_asterisks, axis=1)
     df["training_computation_petaflop"] = df["Approx Compute (FLOP)"] / 1e15
     df.drop("Approx Compute (FLOP)", axis=1, inplace=True)
     df["MMLU avg"] *= 100
@@ -33,26 +32,3 @@ def run(dest_dir: str) -> None:

     # Save changes in the new garden dataset.
ds_garden.save() - - -def add_asterisks(row): - if row["Architecture"] == "Gopher": - if row["MMLU avg"] <= 0.26: # <= 1 billion - return "Gopher" + "*" - elif row["MMLU avg"] <= 0.28: # <= 10 billion - return "Gopher" + "**" - elif row["MMLU avg"] <= 0.30: # <= 100 billion - return "Gopher" + "***" - else: - return "Gopher" + "****" - elif row["Architecture"] == "PaLM": - if row["MMLU avg"] <= 0.26: # <= 10 billion - return "PaLM" + "*" - elif row["MMLU avg"] <= 0.54: # <= 100 billion - return "PaLM" + "**" - elif row["MMLU avg"] <= 0.63: # <= 1 trillion - return "PaLM" + "***" - else: - return "PaLM" + "****" - else: - return row["Architecture"] diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.countries.json b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.countries.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.countries.json @@ -0,0 +1,2 @@ +{ +} diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.excluded_countries.json b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.excluded_countries.json new file mode 100644 index 00000000000..0d4f101c7a3 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.excluded_countries.json @@ -0,0 +1,2 @@ +[ +] diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.meta.yml new file mode 100644 index 00000000000..ff28c709bdf --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.meta.yml @@ -0,0 +1,35 @@ +# (Inherited from meadow, remove if not different.) +all_sources: + - source_testing: &source-testing + name: # Example: Testing Short Citation + published_by: # (if different to short citation). Example: Testing Full Citation + url: # Example: https://url_of_testing_source.com/ + date_accessed: # Example: 2023-01-01 + publication_date: # Example: 2023-01-01 + publication_year: # (if publication_date is not given). Example: 2023 + # description: Source description. + +# (Inherited from meadow, remove if not different.) +dataset: + title: # Example: Testing Dataset Name (Institution, 2023) + # description: Dataset description. + licenses: + - name: # Example: Testing License Name + url: # Example: https://url_of_testing_source.com/license + sources: + - *source-testing + +tables: + cset: + # (Inherited from meadow, remove if not different.) + variables: + # testing_variable: + # title: Testing variable title + # unit: arbitrary units + # short_unit: au + # description: Full description of testing variable. + # sources: + # - *source-testing + # display: + # entityAnnotationsMap: Test annotation + # numDecimalPlaces: 0 diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.py b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.py new file mode 100644 index 00000000000..ada768e24a2 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.py @@ -0,0 +1,38 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from typing import cast + +from owid.catalog import Dataset, Table + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. 
+    ds_meadow = cast(Dataset, paths.load_dependency("cset"))
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["cset"]
+
+    #
+    # Process data.
+    #
+    tb: Table = geo.harmonize_countries(
+        df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
+    )
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/artificial_intelligence/2023-07-23/cset.py b/etl/steps/data/grapher/artificial_intelligence/2023-07-23/cset.py
new file mode 100644
index 00000000000..182bbfc359e
--- /dev/null
+++ b/etl/steps/data/grapher/artificial_intelligence/2023-07-23/cset.py
@@ -0,0 +1,39 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from typing import cast
+
+from owid.catalog import Dataset
+
+from etl.helpers import PathFinder, create_dataset, grapher_checks
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = cast(Dataset, paths.load_dependency("cset"))
+
+    # Read table from garden dataset.
+    tb = ds_garden["cset"]
+
+    #
+    # Process data.
+    #
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)
+
+    #
+    # Checks.
+    #
+    grapher_checks(ds_grapher)
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/artificial_intelligence/2023-07-23/cset.py b/etl/steps/data/meadow/artificial_intelligence/2023-07-23/cset.py
new file mode 100644
index 00000000000..96f35526e23
--- /dev/null
+++ b/etl/steps/data/meadow/artificial_intelligence/2023-07-23/cset.py
@@ -0,0 +1,38 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from typing import cast
+
+import pandas as pd
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset
+from etl.snapshot import Snapshot
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = cast(Snapshot, paths.load_dependency("cset.csv"))
+
+    # Load data from snapshot.
+    df = pd.read_csv(snap.path, low_memory=False)
+
+    #
+    # Process data.
+    #
+    # Create a new table and ensure all columns are snake-case.
+    tb = Table(df, short_name=paths.short_name, underscore=True)
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/artificial_intelligence/2023-07-12/epoch_llms.csv.dvc b/snapshots/artificial_intelligence/2023-07-12/epoch_llms.csv.dvc
index a3516f49428..e33f52e3f45 100644
--- a/snapshots/artificial_intelligence/2023-07-12/epoch_llms.csv.dvc
+++ b/snapshots/artificial_intelligence/2023-07-12/epoch_llms.csv.dvc
@@ -1,21 +1,22 @@
 meta:
-  name: Large Language Model performance and compute, EPOCH (2023)
+  name: Large Language Model performance and compute, Epoch (2023)
   publication_year: 2023
   publication_date: '2023-07-12'
-  source_name: EPOCH (2023)
+  source_name: Epoch (2023)
   source_published_by: "David Owen (2023), Extrapolating performance in language modeling benchmarks. Published online at epochai.org. Retrieved from: 'https://epochai.org/blog/extrapolating-performance-in-language-modelling-benchmarks' [online resource]"
-  url: provided directly by the source
+  url:
+    https://docs.google.com/spreadsheets/d/1HSGbUVwGy3XLuChH_H16Keux2jmVfKT9rfDrC3uu-SQ/edit?usp=sharing
   source_data_url:
   license_url:
   license_name: Creative Commons BY 4.0
   date_accessed: 2023-07-12
   is_public: true
   description: |
-    EPOCH dataset on how performance on a MMLU language benchmark scales with computational resources.
+    Epoch dataset on how performance on an MMLU language benchmark scales with computational resources.
 wdir: ../../../data/snapshots/artificial_intelligence/2023-07-12
 outs:
-- md5: fcdc4e018fc6fc930a2227819cd45ff3
-  size: 1523
+- md5: 30e9dea32e9a2952e2f021f90290eb54
+  size: 1574
   path: epoch_llms.csv
diff --git a/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc b/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc
new file mode 100644
index 00000000000..a5945d8209e
--- /dev/null
+++ b/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc
@@ -0,0 +1,28 @@
+meta:
+  name: 'Country Activity Tracker: Artificial Intelligence (Center for Security and
+    Emerging Technology, 2023)'
+  publication_year: 2023
+
+  publication_date: '2023-07-21'
+  source_name: Center for Security and Emerging Technology (2023)
+  source_published_by: Emerging Technology Observatory Country Activity Tracker, Artificial
+    Intelligence (Center for Security and Emerging Technology, 2023)
+  url: https://cat.eto.tech/
+  source_data_url:
+  license_url: https://eto.tech/tou/
+  license_name: Creative Commons BY 4.0
+  date_accessed: 2023-07-23
+  is_public: true
+  description: |
+    The research data in CAT (Country Activity Tracker) is derived from ETO's Merged Academic Corpus (MAC), which contains detailed information on over 270 million scholarly articles worldwide. CAT uses only AI-related articles from the MAC. Articles are attributed to countries based on the author organizations listed in each article's metadata. An article is attributed to a country if it lists at least one author affiliated with an organization in that country.
+
+    The top ten authors for each country are identified based on the number of citations to articles they released while affiliated with institutions in that country. CAT classifies articles into AI subfields using subject assignment scores in the MAC. Articles are assigned to up to three subfields based on their scores.
+
+    CAT includes patent data from 1790 Analytics and Dimensions, and it counts AI-related patent families, including patent applications and granted patents. Patents are attributed to the country where they are filed, not necessarily the inventor's nationality.
+
+    CAT also uses Crunchbase data to identify AI-related companies based on various criteria and includes investment metrics for these companies.
+
+    The data in CAT is updated at least once a quarter, with plans for more frequent updates in the future.
+wdir: ../../../data/snapshots/artificial_intelligence/2023-07-23
+outs:
+- md5: afe33f3d16404cf7527a43c5189847e8
+  size: 1658442
+  path: cset.csv
diff --git a/snapshots/artificial_intelligence/2023-07-23/cset.py b/snapshots/artificial_intelligence/2023-07-23/cset.py
new file mode 100644
index 00000000000..303e5ca8464
--- /dev/null
+++ b/snapshots/artificial_intelligence/2023-07-23/cset.py
@@ -0,0 +1,62 @@
+"""Script to create a snapshot of dataset 'Country Activity Tracker: Artificial Intelligence (Center for Security and Emerging Technology, 2023)'."""
+
+from pathlib import Path
+
+import click
+import pandas as pd
+from owid.datautils.io import df_to_file
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option(
+    "--upload/--skip-upload",
+    default=True,
+    type=bool,
+    help="Upload dataset to Snapshot",
+)
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/cset.csv")
+    common_path = "/Users/veronikasamborska/Downloads/owid_cat_data_20230717/"
+
+    files = {
+        "companies": ["companies_yearly_disclosed.csv", "companies_yearly_estimated.csv"],
+        # "patents": ["patents_yearly_applications.csv", "patents_yearly_granted.csv"],
+        "articles": ["publications_yearly_articles.csv", "publications_yearly_citations.csv"],
+    }
+
+    all_dfs = []
+    for field, file_ids in files.items():
+        all_dfs.append(read_and_clean_data(file_ids, common_path, field))
+
+    result = pd.concat(all_dfs)
+    df_to_file(result, file_path=snap.path)
+    # Add file to DVC and upload to S3.
+    snap.dvc_add(upload=upload)
+
+
+def read_and_clean_data(file_ids, common_path, field_name):
+    """Read the given CSV files, align their column names and merge them on year, country and field."""
+    all_dfs_list = []
+    for file_id in file_ids:
+        df_add = pd.read_csv(common_path + file_id)
+        if "estimated" in file_id:
+            # Rename so the estimated series does not clash with the disclosed one after merging.
+            df_add.rename(columns={"disclosed_investment": "disclosed_investment_estimated"}, inplace=True)
+        all_dfs_list.append(df_add)
+
+    merged_df = all_dfs_list[0]
+    for df in all_dfs_list[1:]:
+        merged_df = pd.merge(merged_df, df, on=["year", "country", "field"])
+
+    merged_df.rename(columns={"field": field_name}, inplace=True)
+    return merged_df
+
+
+if __name__ == "__main__":
+    main()
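For reference, the column handling inside read_and_clean_data() above can be illustrated with in-memory frames in place of the local CSV files; the values are invented for the example, and only the rename-then-merge step mirrors the script:

import pandas as pd

# Stand-ins for companies_yearly_disclosed.csv and companies_yearly_estimated.csv.
disclosed = pd.DataFrame(
    {"year": [2020], "country": ["United States"], "field": ["AI"], "disclosed_investment": [10.0]}
)
estimated = pd.DataFrame(
    {"year": [2020], "country": ["United States"], "field": ["AI"], "disclosed_investment": [25.0]}
)

# Both files share the "disclosed_investment" column, so the estimated series is
# renamed before merging to keep the two measures side by side.
estimated = estimated.rename(columns={"disclosed_investment": "disclosed_investment_estimated"})
merged = pd.merge(disclosed, estimated, on=["year", "country", "field"])

# As in the script, "field" is then renamed to the metric group ("companies"), so that
# rows from different metric groups remain distinguishable after pd.concat().
merged = merged.rename(columns={"field": "companies"})
print(merged)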