From 7369b0c20f5edc1e4dc465d63fe30e113430718b Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 Date: Mon, 24 Jul 2023 14:39:20 +0200 Subject: [PATCH] small AI fixes + CSET dataset on investment --- dag/artificial_intelligence.yml | 8 ++ dag/walkthrough.yml | 2 +- .../2023-06-14/ai_national_strategy.py | 2 +- .../2023-06-14/ai_robots.meta.yml | 12 +- .../2023-06-14/ai_robots.py | 1 + .../2023-06-26/ai_wrp_2021.meta.yml | 40 +++++- .../2023-06-26/ai_wrp_2021.py | 116 +++++++++++++----- .../2023-06-26/ai_wrp_2021_grouped.meta.yml | 44 ++++++- .../2023-06-26/ai_wrp_2021_grouped.py | 92 ++++++++------ .../2023-07-12/epoch_llms.meta.yml | 10 +- .../2023-07-12/epoch_llms.py | 24 ---- .../2023-07-23/cset.countries.json | 2 + .../2023-07-23/cset.excluded_countries.json | 2 + .../2023-07-23/cset.meta.yml | 35 ++++++ .../2023-07-23/cset.py | 38 ++++++ .../2023-07-23/cset.py | 39 ++++++ .../2023-07-23/cset.py | 38 ++++++ .../2023-07-12/epoch_llms.csv.dvc | 13 +- .../2023-07-23/cset.csv.dvc | 28 +++++ .../2023-07-23/cset.py | 60 +++++++++ 20 files changed, 492 insertions(+), 114 deletions(-) create mode 100644 etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.countries.json create mode 100644 etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.excluded_countries.json create mode 100644 etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.meta.yml create mode 100644 etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2023-07-23/cset.py create mode 100644 etl/steps/data/meadow/artificial_intelligence/2023-07-23/cset.py create mode 100644 snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc create mode 100644 snapshots/artificial_intelligence/2023-07-23/cset.py diff --git a/dag/artificial_intelligence.yml b/dag/artificial_intelligence.yml index 930b933847b..3a8cb2a4fa5 100644 --- a/dag/artificial_intelligence.yml +++ b/dag/artificial_intelligence.yml @@ -205,3 +205,11 @@ steps: - data://meadow/artificial_intelligence/2023-07-07/semiconductors_cset data://grapher/artificial_intelligence/2023-07-07/semiconductors_cset: - data://garden/artificial_intelligence/2023-07-07/semiconductors_cset + +# CSET data on patents, articles and private investment + data://meadow/artificial_intelligence/2023-07-23/cset: + - snapshot://artificial_intelligence/2023-07-23/cset.csv + data://garden/artificial_intelligence/2023-07-23/cset: + - data://meadow/artificial_intelligence/2023-07-23/cset + data://grapher/artificial_intelligence/2023-07-23/cset: + - data://garden/artificial_intelligence/2023-07-23/cset diff --git a/dag/walkthrough.yml b/dag/walkthrough.yml index 75d8204ec07..413e39c0dbe 100644 --- a/dag/walkthrough.yml +++ b/dag/walkthrough.yml @@ -7,4 +7,4 @@ steps: data://grapher/dummy/2020-01-01/dummy: - data://garden/dummy/2020-01-01/dummy data://explorers/dummy/2020-01-01/dummy: - - data://garden/dummy/2020-01-01/dummy \ No newline at end of file + - data://garden/dummy/2020-01-01/dummy diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py index baebc788785..9fa62b488c5 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py @@ -61,7 +61,7 @@ def run(dest_dir: str) -> None: group["released_national_strategy_on_ai"].fillna(method="ffill", inplace=True) # Fill 
remaining NaN values with "Not Released"
-    group["released_national_strategy_on_ai"].fillna("Not Released", inplace=True)
+    group["released_national_strategy_on_ai"].fillna("Not released", inplace=True)
     df_merged.loc[group.index] = group
     df_merged.drop("released", axis=1, inplace=True)
     tb = Table(df_merged, short_name=paths.short_name, underscore=True)
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.meta.yml
index ee27a599c35..61bf72e4287 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.meta.yml
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.meta.yml
@@ -184,6 +184,7 @@ tables:
       unit: 'robots'
       description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
       display:
+        name: Total industrial robots in operation
         numDecimalPlaces: 0

     number_of_industrial_robots_installed_2021:
@@ -191,7 +192,7 @@ tables:
       unit: 'robots'
       description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
       display:
-        name: Number of industrial robots installed
+        name: Total industrial robots installed
         numDecimalPlaces: 0

     annual_count__number_of_industrial_robots_installed:
@@ -199,7 +200,7 @@ tables:
       unit: 'robots'
       description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
       display:
-        name: Number of industrial robots installed
+        name: Annual industrial robots installed
         numDecimalPlaces: 0

     new_robots_installed__number_of_industrial_robots_installed:
@@ -208,4 +209,11 @@ tables:
       description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
       display:
         name: Number of industrial robots installed
+        numDecimalPlaces: 0
+    unspecified_others:
+      title: Unspecified or Other Sector
+      unit: 'robots'
+      description: "Industrial robots are defined as “automatically controlled, reprogrammable, multipurpose manipulator, programmable in three or more axes, which can be either fixed in place or mobile for use in industrial automation applications.”"
+      display:
+        name: Unspecified or other
         numDecimalPlaces: 0
\ No newline at end of file
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.py
index d09cb646c39..725b4e62e5c 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.py
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_robots.py
@@ -101,6 +101,7 @@ def run(dest_dir: str) -> None:

     # Merge pivot table for professional service robots, application area and sector with aggregates
     merge_all = pd.merge(merge_service, df_agg_clean, on=["year", "country"], how="outer")
+    merge_all["unspecified_others"] = merge_all["Unspecified Sector"] + merge_all["All others"]

     # Set the index as 'country' and 'year'
     merge_all.set_index(["country", "year"], inplace=True)
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.meta.yml
index 38694c3ec5a..636f25e83fb 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.meta.yml
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.meta.yml
@@ -66,9 +66,18 @@ tables:
       display:
         numDecimalPlaces: 0

+    dk_no_op:
+      title: "Don't have an opinion or Don't know - help/harm question"
+      description: Share of respondents who said "Don't have an opinion" or "Don't know" to the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No opinion or don't know"
+        numDecimalPlaces: 0
+
     other_help_harm:
       title: "Don't have an opinion, Don't know, or Refused - help/harm question"
-      description: Share of respondents who refused to answer the question or said "Don't have an opinion" to the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      description: Share of respondents who said "Don't have an opinion" or "Don't know", or refused to answer the question "Will Artificial Intelligence help or harm people in the next 20 years?".
       unit: '%'
       short_unit: '%'
       display:
@@ -83,3 +92,30 @@ tables:
       display:
         name: "Other"
         numDecimalPlaces: 0
+
+    refused__help_harm:
+      title: "Refused - help/harm question"
+      description: Share of respondents who refused to answer the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No response"
+        numDecimalPlaces: 0
+
+    dk__cars:
+      title: "Don't know - self-driving cars question"
+      description: Share of respondents who said "Don't know" to the question "Would you feel safe in a car driven by a computer without a human driver?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "Don't know"
+        numDecimalPlaces: 0
+
+    refused__cars:
+      title: "Refused - self-driving cars question"
+      description: Share of respondents who refused to answer the question "Would you feel safe in a car driven by a computer without a human driver?".
+ unit: '%' + short_unit: '%' + display: + name: "No response" + numDecimalPlaces: 0 \ No newline at end of file diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py index 30d5c95a33b..bfe7cd4e1c7 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py @@ -2,6 +2,7 @@ from typing import cast +import numpy as np import pandas as pd from owid.catalog import Dataset, Table from structlog import get_logger @@ -57,10 +58,17 @@ def run(dest_dir: str) -> None: columns_to_split_by = ["country", "gender", "education", "income_5", "emp_2010", "agegroups4", "globalregion"] # Dictionary to map response codes to labels for question 9 - dict_q9 = {1: "Mostly help", 2: "Mostly harm", 3: "Don't have an opinion", 4: "Neither", 98: "DK", 99: "Refused"} + dict_q9 = { + 1: "Mostly help", + 2: "Mostly harm", + 3: "Don't have an opinion", + 4: "Neither", + 98: "DK(help/harm)", + 99: "Refused(help/harm)", + } # Dictionary to map response codes to labels for question 8 - dict_q8 = {1: "Yes, would feel safe", 2: "No, would not feel safe", 98: "DK", 99: "Refused"} + dict_q8 = {1: "Yes, would feel safe", 2: "No, would not feel safe", 98: "DK(cars)", 99: "Refused(cars)"} # Create a list of DataFrames for each column_to_split_by for question 8 df_q8_list = [] @@ -95,43 +103,28 @@ def run(dest_dir: str) -> None: "No, would not feel safe", "Mostly harm", "Neither", + "DK(help/harm)", + "Refused(help/harm)", + "DK(cars)", + "Refused(cars)", + "Don't have an opinion", ] ] .dropna(subset=["country"]) .copy() ) - # Select rows with categories (NaN country rows) - world_df = df_merge[df_merge["country"].isna()].copy() - world_df.reset_index(drop=True, inplace=True) - # Set country as World - world_df["country"] = world_df["country"].astype(str) - world_df.loc[world_df["country"] == "nan", "country"] = "World" - # Calculates the percentage of valid responses for the "Mostly help" column in a DataFrame, split by gender, income etc. - conc_df_help = pivot_by_category(world_df, "Mostly help") - conc_df_harm = pivot_by_category(world_df, "Mostly harm") - conc_df_neither = pivot_by_category(world_df, "Neither") + merge_rest = calculate_world_data(df_merge, df_without_categories) - merge_help_harm = pd.merge(conc_df_help, conc_df_harm, on=["year", "country"], how="outer") - merge_help_harm_neither = pd.merge(merge_help_harm, conc_df_neither, on=["year", "country"], how="outer") - - # Calculates the percentage of valid responses for a "Yes, would feel safe column in a DataFrame, split by gender, income etc. 
-    conc_df_yes = pivot_by_category(world_df, "Yes, would feel safe")
-    conc_df_no = pivot_by_category(world_df, "No, would not feel safe")
-    merge_yes_no = pd.merge(conc_df_yes, conc_df_no, on=["year", "country"], how="outer")
+    tb = Table(merge_rest, short_name=paths.short_name, underscore=True)

-    # Merge all dataframes into one
-    merge_categorized = pd.merge(merge_help_harm_neither, merge_yes_no, on=["year", "country"], how="outer")
-    merge_rest = pd.merge(df_without_categories, merge_categorized, on=["year", "country"], how="outer")
-    merge_rest["other_yes_no"] = 100 - (merge_rest["Yes, would feel safe"] + merge_rest["No, would not feel safe"])
-    merge_rest["other_help_harm"] = 100 - (
-        merge_rest["Mostly help"] + merge_rest["Mostly harm"] + merge_rest["Neither"]
+    tb["dk_no_op"] = tb[["dk__help_harm", "dont_have_an_opinion"]].sum(axis=1).values
+    tb["other_help_harm"] = tb[["dk__help_harm", "dont_have_an_opinion", "refused__help_harm"]].sum(axis=1).values
+    tb["other_yes_no"] = tb[["dk__cars", "refused__cars"]].sum(axis=1).values
+    tb[["dk_no_op", "other_help_harm", "other_yes_no"]] = tb[["dk_no_op", "other_help_harm", "other_yes_no"]].replace(
+        0.0, np.NaN
     )
-    merge_rest.set_index(["year", "country"], inplace=True)
-
-    tb = Table(merge_rest, short_name=paths.short_name, underscore=True)
-
     #
     # Save outputs.
     #
@@ -150,7 +143,7 @@ def calculate_percentage(df, column, valid_responses_dict, column_to_split_by):
     Args:
         df (DataFrame): The input DataFrame.
         column (str): The column name to calculate the percentage.
         valid_responses_dict (dict): A dictionary mapping valid response codes to their corresponding labels.
         column_to_split_by (str): The column name to split by.
     Returns:
         DataFrame: A DataFrame with columns: the column_to_split_by, "year", "column", "count", and "percentage".
@@ -210,9 +204,67 @@ def question_extract(q, df, column_to_split_by, dict_q):
     pivoted_df.columns.name = None

     if q == "q9":
-        return pivoted_df[["year", column_to_split_by, "Mostly help", "Mostly harm", "Neither"]]
+        return pivoted_df[
+            [
+                "year",
+                column_to_split_by,
+                "Mostly help",
+                "Mostly harm",
+                "Neither",
+                "Don't have an opinion",
+                "DK(help/harm)",
+                "Refused(help/harm)",
+            ]
+        ]
     else:
-        return pivoted_df[["year", column_to_split_by, "Yes, would feel safe", "No, would not feel safe"]]
+        return pivoted_df[
+            ["year", column_to_split_by, "Yes, would feel safe", "No, would not feel safe", "DK(cars)", "Refused(cars)"]
+        ]
+
+
+def calculate_world_data(df_merge, df_without_categories):
+    # Select rows with categories (NaN country rows)
+    world_df = df_merge[df_merge["country"].isna()].copy()
+    world_df.reset_index(drop=True, inplace=True)
+
+    # Set country as World
+    world_df["country"] = world_df["country"].astype(str)
+    world_df.loc[world_df["country"] == "nan", "country"] = "World"
+
+    # Calculate the percentage of valid responses for "Mostly help", "Mostly harm", "Neither" in a DataFrame,
+    # split by gender, income etc.
+    columns_to_calculate = [
+        "Mostly help",
+        "Mostly harm",
+        "Neither",
+        "DK(help/harm)",
+        "Don't have an opinion",
+        "Refused(help/harm)",
+    ]
+    merge_help_harm_all = None
+    for column in columns_to_calculate:
+        conc_df = pivot_by_category(world_df, column)
+        if merge_help_harm_all is None:
+            merge_help_harm_all = conc_df
+        else:
+            merge_help_harm_all = pd.merge(merge_help_harm_all, conc_df, on=["year", "country"], how="outer")
+
+    # Calculate the percentage of valid responses for "Yes, would feel safe" in a DataFrame, split by gender, income etc.
+    columns_to_calculate = ["Yes, would feel safe", "No, would not feel safe", "DK(cars)", "Refused(cars)"]
+    merge_yes_no = None
+    for column in columns_to_calculate:
+        conc_df = pivot_by_category(world_df, column)
+        if merge_yes_no is None:
+            merge_yes_no = conc_df
+        else:
+            merge_yes_no = pd.merge(merge_yes_no, conc_df, on=["year", "country"], how="outer")
+
+    # Merge all dataframes into one
+    merge_categorized = pd.merge(merge_help_harm_all, merge_yes_no, on=["year", "country"], how="outer")
+    merge_rest = pd.merge(df_without_categories, merge_categorized, on=["year", "country"], how="outer")
+
+    merge_rest.set_index(["year", "country"], inplace=True)
+    return merge_rest


 def map_values(df):
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.meta.yml
index a21266e7436..38f59eadcc9 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.meta.yml
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.meta.yml
@@ -66,16 +66,25 @@ tables:
       display:
         numDecimalPlaces: 0

-    other_help_harm:
-      title: "Don't have an opinion, Don't know, or Refused - help/harm question"
-      description: Share of respondents who refused to answer the question or said "Don't have an opinion" to the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+    dk_no_op_value:
+      title: "Don't have an opinion or Don't know - help/harm question"
+      description: Share of respondents who said "Don't have an opinion" or "Don't know" to the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No opinion or don't know"
+        numDecimalPlaces: 0
+
+    other_help_harm_value:
+      title: "Don't have an opinion, Don't know, or Refused - help/harm question"
+      description: Share of respondents who said "Don't have an opinion" or "Don't know", or refused to answer the question "Will Artificial Intelligence help or harm people in the next 20 years?".
       unit: '%'
       short_unit: '%'
       display:
         name: "Other"
         numDecimalPlaces: 0

-    other_yes_no:
+    other_yes_no_value:
       title: "Don't know or Refused - self-driving cars question"
       description: Share of respondents who refused to answer the question or said "Don't know" to the question "Would you feel safe in a car driven by a computer without a human driver?".
       unit: '%'
@@ -83,3 +92,30 @@ tables:
       display:
         name: "Other"
         numDecimalPlaces: 0
+
+    refused__help_harm_value:
+      title: "Refused - help/harm question"
+      description: Share of respondents who refused to answer the question "Will Artificial Intelligence help or harm people in the next 20 years?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No response"
+        numDecimalPlaces: 0
+
+    dk__cars_value:
+      title: "Don't know - self-driving cars question"
+      description: Share of respondents who said "Don't know" to the question "Would you feel safe in a car driven by a computer without a human driver?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "Don't know"
+        numDecimalPlaces: 0
+
+    refused__cars_value:
+      title: "Refused - self-driving cars question"
+      description: Share of respondents who refused to answer the question "Would you feel safe in a car driven by a computer without a human driver?".
+      unit: '%'
+      short_unit: '%'
+      display:
+        name: "No response"
+        numDecimalPlaces: 0
\ No newline at end of file
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.py b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.py
index 433fd931737..978eeb8d97e 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.py
+++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021_grouped.py
@@ -1,6 +1,6 @@
 """Load a meadow dataset and create a garden dataset."""

-from typing import cast
+from typing import List, cast

 import pandas as pd
 from owid.catalog import Dataset, Table
@@ -14,32 +14,20 @@
 paths = PathFinder(__file__)


-# Function to melt and clean dataframe based on column name
-def melt_and_clean(df, col_name):
-    excluded_columns = [
-        "yes__would_feel_safe",
-        "mostly_help",
-        "no__would_not_feel_safe",
-        "mostly_harm",
-        "other_help_harm",
-        "other_yes_no",
-        "neither",
-    ]
-
+def melt_and_clean(df: pd.DataFrame, col_name: str, excluded_columns: List[str]) -> pd.DataFrame:
+    """
+    Melt and clean dataframe based on column name.
+    """
     melted_df = pd.melt(
         df.reset_index(),
         id_vars=["year", "country"],
         value_vars=[col for col in df.columns if col_name in col and col not in excluded_columns],
     )

-    melted_df[col_name] = (
+    melted_df["group"] = (
         melted_df["variable"].str.split("_" + col_name, expand=True)[0].str.replace("_", " ").str.title()
     )
     melted_df.rename(columns={"value": f"{col_name}_value"}, inplace=True)
-    melted_df.rename(columns={col_name: "group"}, inplace=True)
-
     melted_df = melted_df[melted_df[f"{col_name}_value"].notnull()]
-    melted_df.reset_index(drop=True, inplace=True)
-
     return melted_df[["year", f"{col_name}_value", "group"]]


@@ -48,28 +36,58 @@ def run(dest_dir: str) -> None:
     # Load garden dataset.
     ds_garden = cast(Dataset, paths.load_dependency("ai_wrp_2021"))
-
-    # Read table from meadow dataset.
df = pd.DataFrame(ds_garden["ai_wrp_2021"]) - # Melt and clean dataframes - melted_yes = melt_and_clean(df, "yes__would_feel_safe").dropna(subset=["yes__would_feel_safe_value"]) - melted_no = melt_and_clean(df, "no__would_not_feel_safe").dropna(subset=["no__would_not_feel_safe_value"]) - merge_yes_no = pd.merge(melted_yes, melted_no, on=["year", "group"], how="outer") - melted_help = melt_and_clean(df, "mostly_help").dropna(subset=["mostly_help_value"]) - melted_harm = melt_and_clean(df, "mostly_harm").dropna(subset=["mostly_harm_value"]) - melted_neither = melt_and_clean(df, "neither").dropna(subset=["neither_value"]) - merge_help_harm = pd.merge(melted_help, melted_harm, on=["year", "group"], how="outer") - merge_help_harm_neither = pd.merge(merge_help_harm, melted_neither, on=["year", "group"], how="outer") - merge_all = pd.merge(merge_yes_no, merge_help_harm_neither, on=["year", "group"], how="outer") - - merge_all["other_help_harm"] = 100 - ( - merge_all["mostly_help_value"] + merge_all["mostly_harm_value"] + merge_all["neither_value"] - ) + columns_to_melt = [ + "yes__would_feel_safe", + "no__would_not_feel_safe", + "dk__cars", + "refused__cars", + "mostly_help", + "mostly_harm", + "neither", + "dk__help_harm", + "dont_have_an_opinion", + "refused__help_harm", + ] + + # Define a common list of excluded columns. + excluded_columns = [ + "yes__would_feel_safe", + "mostly_help", + "no__would_not_feel_safe", + "mostly_harm", + "other_yes_no", + "other_help_harm", + "neither", + "refused__cars", + "dk__cars", + "refused__help_harm", + "dk_no_op", + "dk__help_harm", + "dont_have_an_opinion", + ] + + # Using a dictionary to store the melted dataframes. + melted_dfs = {} + + for column in columns_to_melt: + melted_dfs[column] = melt_and_clean(df, column, excluded_columns) - merge_all["other_yes_no"] = 100 - ( - merge_all["yes__would_feel_safe_value"] + merge_all["no__would_not_feel_safe_value"] + merge_all = melted_dfs[columns_to_melt[0]] + + # Merge all melted dataframes together. + for column in columns_to_melt[1:]: + merge_all = pd.merge(merge_all, melted_dfs[column], on=["year", "group"], how="outer") + + # Derive additional columns (mainly to avoid grapher errors) + merge_all["other_yes_no_value"] = merge_all["dk__cars_value"] + merge_all["refused__cars_value"] + merge_all["other_help_harm_value"] = ( + merge_all["dk__help_harm_value"] + + merge_all["dont_have_an_opinion_value"] + + merge_all["refused__help_harm_value"] ) + merge_all["dk_no_op_value"] = merge_all["dk__help_harm_value"] + merge_all["dont_have_an_opinion_value"] # Rename group values group_replacements = { @@ -92,8 +110,8 @@ def run(dest_dir: str) -> None: "Employed Part Time Do Not Want Full Time": "Employed Part-Time (Not seeking Full-Time)", "Employed Part Time Want Full Time": "Employed Part-Time (Seeking Full-Time)", } - merge_all["group"].replace(group_replacements, inplace=True) + merge_all["group"].replace(group_replacements, inplace=True) merge_all.set_index(["year", "group"], inplace=True) # Create a new garden dataset with the same metadata as the meadow dataset. 
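For readers unfamiliar with the melt-and-merge pattern used in ai_wrp_2021_grouped.py above: melt_and_clean() reshapes wide per-group columns (e.g. male_mostly_help, female_mostly_help) into long rows keyed by a "group" label, and run() then outer-merges one long frame per response category on ["year", "group"]. A minimal, self-contained sketch of that reshaping on a toy frame; the column names and values here are invented for illustration and are not the real dataset schema:

import pandas as pd

# Toy wide frame: one column per (group, response) pair.
df = pd.DataFrame(
    {
        "year": [2021],
        "country": ["World"],
        "male_mostly_help": [55.0],
        "female_mostly_help": [48.0],
    }
)

melted = pd.melt(
    df,
    id_vars=["year", "country"],
    value_vars=[c for c in df.columns if "mostly_help" in c],
)
# Recover the group label from the column prefix: "male_mostly_help" -> "Male".
melted["group"] = melted["variable"].str.split("_mostly_help", expand=True)[0].str.replace("_", " ").str.title()
melted = melted.rename(columns={"value": "mostly_help_value"})
print(melted[["year", "mostly_help_value", "group"]])
#    year  mostly_help_value   group
# 0  2021               55.0    Male
# 1  2021               48.0  Female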
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.meta.yml
index 41c6a5320ed..98abc0db8e2 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.meta.yml
+++ b/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.meta.yml
@@ -1,12 +1,12 @@
 dataset:
-  title: Large Language Model performance and compute, EPOCH (2023)
+  title: Large Language Model performance and compute, Epoch (2023)
   description: Epoch dataset on how performance on an MMLU language benchmark scales with computational resources.
   licenses:
   - name: Creative Commons BY 4.0
   sources:
-  - name: EPOCH (2023)
-    description: EPOCH dataset on how performance on a MMLU language benchmark scales with computational resources.
-    url: provided directly by the source
+  - name: Epoch (2023)
+    description: Epoch dataset on how performance on an MMLU language benchmark scales with computational resources.
+    url: https://docs.google.com/spreadsheets/d/1HSGbUVwGy3XLuChH_H16Keux2jmVfKT9rfDrC3uu-SQ/edit?usp=sharing
     date_accessed: '2023-07-12'
     publication_date: '2023-07-12'
     publication_year: 2023
@@ -71,5 +71,5 @@ tables:
       The training computation used can vary depending on factors such as the size of the dataset, size and complexity of the system architecture, and the level of parallelism used during training, among other reasons.
     display:
-      title: Training compute
+      title: Training computation (petaFLOP)
       numDecimalPlaces: 0
diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.py b/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.py
index 698d1aa4efa..47684ced76d 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.py
+++ b/etl/steps/data/garden/artificial_intelligence/2023-07-12/epoch_llms.py
@@ -19,7 +19,6 @@ def run(dest_dir: str) -> None:
     # Load meadow dataset.
     snap = cast(Snapshot, paths.load_dependency("epoch_llms.csv"))
     df = pd.read_csv(snap.path)
-    df["Architecture"] = df.apply(add_asterisks, axis=1)
     df["training_computation_petaflop"] = df["Approx Compute (FLOP)"] / 1e15
     df.drop("Approx Compute (FLOP)", axis=1, inplace=True)
     df["MMLU avg"] *= 100
@@ -33,26 +32,3 @@ def run(dest_dir: str) -> None:

     # Save changes in the new garden dataset.
ds_garden.save() - - -def add_asterisks(row): - if row["Architecture"] == "Gopher": - if row["MMLU avg"] <= 0.26: # <= 1 billion - return "Gopher" + "*" - elif row["MMLU avg"] <= 0.28: # <= 10 billion - return "Gopher" + "**" - elif row["MMLU avg"] <= 0.30: # <= 100 billion - return "Gopher" + "***" - else: - return "Gopher" + "****" - elif row["Architecture"] == "PaLM": - if row["MMLU avg"] <= 0.26: # <= 10 billion - return "PaLM" + "*" - elif row["MMLU avg"] <= 0.54: # <= 100 billion - return "PaLM" + "**" - elif row["MMLU avg"] <= 0.63: # <= 1 trillion - return "PaLM" + "***" - else: - return "PaLM" + "****" - else: - return row["Architecture"] diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.countries.json b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.countries.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.countries.json @@ -0,0 +1,2 @@ +{ +} diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.excluded_countries.json b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.excluded_countries.json new file mode 100644 index 00000000000..0d4f101c7a3 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.excluded_countries.json @@ -0,0 +1,2 @@ +[ +] diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.meta.yml b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.meta.yml new file mode 100644 index 00000000000..ff28c709bdf --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.meta.yml @@ -0,0 +1,35 @@ +# (Inherited from meadow, remove if not different.) +all_sources: + - source_testing: &source-testing + name: # Example: Testing Short Citation + published_by: # (if different to short citation). Example: Testing Full Citation + url: # Example: https://url_of_testing_source.com/ + date_accessed: # Example: 2023-01-01 + publication_date: # Example: 2023-01-01 + publication_year: # (if publication_date is not given). Example: 2023 + # description: Source description. + +# (Inherited from meadow, remove if not different.) +dataset: + title: # Example: Testing Dataset Name (Institution, 2023) + # description: Dataset description. + licenses: + - name: # Example: Testing License Name + url: # Example: https://url_of_testing_source.com/license + sources: + - *source-testing + +tables: + cset: + # (Inherited from meadow, remove if not different.) + variables: + # testing_variable: + # title: Testing variable title + # unit: arbitrary units + # short_unit: au + # description: Full description of testing variable. + # sources: + # - *source-testing + # display: + # entityAnnotationsMap: Test annotation + # numDecimalPlaces: 0 diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.py b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.py new file mode 100644 index 00000000000..ada768e24a2 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2023-07-23/cset.py @@ -0,0 +1,38 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from typing import cast + +from owid.catalog import Dataset, Table + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. 
+    ds_meadow = cast(Dataset, paths.load_dependency("cset"))
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["cset"]
+
+    #
+    # Process data.
+    #
+    tb: Table = geo.harmonize_countries(
+        df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
+    )
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/artificial_intelligence/2023-07-23/cset.py b/etl/steps/data/grapher/artificial_intelligence/2023-07-23/cset.py
new file mode 100644
index 00000000000..182bbfc359e
--- /dev/null
+++ b/etl/steps/data/grapher/artificial_intelligence/2023-07-23/cset.py
@@ -0,0 +1,39 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from typing import cast
+
+from owid.catalog import Dataset
+
+from etl.helpers import PathFinder, create_dataset, grapher_checks
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = cast(Dataset, paths.load_dependency("cset"))
+
+    # Read table from garden dataset.
+    tb = ds_garden["cset"]
+
+    #
+    # Process data.
+    #
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)
+
+    #
+    # Checks.
+    #
+    grapher_checks(ds_grapher)
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/artificial_intelligence/2023-07-23/cset.py b/etl/steps/data/meadow/artificial_intelligence/2023-07-23/cset.py
new file mode 100644
index 00000000000..96f35526e23
--- /dev/null
+++ b/etl/steps/data/meadow/artificial_intelligence/2023-07-23/cset.py
@@ -0,0 +1,38 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from typing import cast
+
+import pandas as pd
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset
+from etl.snapshot import Snapshot
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = cast(Snapshot, paths.load_dependency("cset.csv"))
+
+    # Load data from snapshot.
+    df = pd.read_csv(snap.path, low_memory=False)
+
+    #
+    # Process data.
+    #
+    # Create a new table and ensure all columns are snake-case.
+    tb = Table(df, short_name=paths.short_name, underscore=True)
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/artificial_intelligence/2023-07-12/epoch_llms.csv.dvc b/snapshots/artificial_intelligence/2023-07-12/epoch_llms.csv.dvc
index a3516f49428..e33f52e3f45 100644
--- a/snapshots/artificial_intelligence/2023-07-12/epoch_llms.csv.dvc
+++ b/snapshots/artificial_intelligence/2023-07-12/epoch_llms.csv.dvc
@@ -1,21 +1,22 @@
 meta:
-  name: Large Language Model performance and compute, EPOCH (2023)
+  name: Large Language Model performance and compute, Epoch (2023)
   publication_year: 2023
   publication_date: '2023-07-12'
-  source_name: EPOCH (2023)
+  source_name: Epoch (2023)
   source_published_by: "David Owen (2023), Extrapolating performance in language modeling benchmarks. Published online at epochai.org. Retrieved from: 'https://epochai.org/blog/extrapolating-performance-in-language-modelling-benchmarks' [online resource]"
-  url: provided directly by the source
+  url:
+    https://docs.google.com/spreadsheets/d/1HSGbUVwGy3XLuChH_H16Keux2jmVfKT9rfDrC3uu-SQ/edit?usp=sharing
   source_data_url:
   license_url:
   license_name: Creative Commons BY 4.0
   date_accessed: 2023-07-12
   is_public: true
   description: |
-    EPOCH dataset on how performance on a MMLU language benchmark scales with computational resources.
+    Epoch dataset on how performance on an MMLU language benchmark scales with computational resources.
 wdir: ../../../data/snapshots/artificial_intelligence/2023-07-12
 outs:
-- md5: fcdc4e018fc6fc930a2227819cd45ff3
-  size: 1523
+- md5: 30e9dea32e9a2952e2f021f90290eb54
+  size: 1574
   path: epoch_llms.csv
diff --git a/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc b/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc
new file mode 100644
index 00000000000..a5945d8209e
--- /dev/null
+++ b/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc
@@ -0,0 +1,28 @@
+meta:
+  name: 'Country Activity Tracker: Artificial Intelligence (Center for Security and
+    Emerging Technology, 2023)'
+  publication_year: 2023
+
+  publication_date: '2023-07-21'
+  source_name: Center for Security and Emerging Technology (2023)
+  source_published_by: Emerging Technology Observatory Country Activity Tracker, Artificial
+    Intelligence (Center for Security and Emerging Technology, 2023)
+  url: https://cat.eto.tech/
+  source_data_url:
+  license_url: https://eto.tech/tou/
+  license_name: Creative Commons BY 4.0
+  date_accessed: 2023-07-23
+  is_public: true
+  description: |
+    The research data in CAT (Country Activity Tracker) is derived from ETO's Merged Academic Corpus (MAC), which contains detailed information on over 270 million scholarly articles worldwide. CAT uses only AI-related articles from the MAC. Articles are attributed to countries based on the author organizations listed in each article's metadata. An article is attributed to a country if it lists at least one author affiliated with an organization in that country.
+
+    The top ten authors for each country are identified based on the number of citations to articles they released while affiliated with institutions in that country. CAT classifies articles into AI subfields using subject assignment scores in the MAC. Articles are assigned to up to three subfields based on their scores.
+
+    CAT includes patent data from 1790 Analytics and Dimensions, and it counts AI-related patent families, including patent applications and granted patents. Patents are attributed to the country where they are filed, not necessarily the inventor's nationality.
+
+    CAT also uses Crunchbase data to identify AI-related companies based on various criteria and includes investment metrics for these companies.
+
+    The data in CAT is updated at least once a quarter, with plans for more frequent updates in the future.
+wdir: ../../../data/snapshots/artificial_intelligence/2023-07-23
+outs:
+- md5: afe33f3d16404cf7527a43c5189847e8
+  size: 1658442
+  path: cset.csv
diff --git a/snapshots/artificial_intelligence/2023-07-23/cset.py b/snapshots/artificial_intelligence/2023-07-23/cset.py
new file mode 100644
index 00000000000..303e5ca8464
--- /dev/null
+++ b/snapshots/artificial_intelligence/2023-07-23/cset.py
@@ -0,0 +1,62 @@
+"""Script to create a snapshot of dataset 'Country Activity Tracker: Artificial Intelligence (Center for Security and Emerging Technology, 2023)'."""
+
+from pathlib import Path
+
+import click
+import pandas as pd
+from owid.datautils.io import df_to_file
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option(
+    "--upload/--skip-upload",
+    default=True,
+    type=bool,
+    help="Upload dataset to Snapshot",
+)
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/cset.csv")
+    common_path = "/Users/veronikasamborska/Downloads/owid_cat_data_20230717/"
+
+    files = {
+        "companies": ["companies_yearly_disclosed.csv", "companies_yearly_estimated.csv"],
+        # "patents": ["patents_yearly_applications.csv", "patents_yearly_granted.csv"],
+        "articles": ["publications_yearly_articles.csv", "publications_yearly_citations.csv"],
+    }
+
+    all_dfs = []
+    for field, file_ids in files.items():
+        all_dfs.append(read_and_clean_data(file_ids, common_path, field))
+
+    result = pd.concat(all_dfs)
+    df_to_file(result, file_path=snap.path)
+    # Add file to DVC and upload to S3.
+    snap.dvc_add(upload=upload)
+
+
+def read_and_clean_data(file_ids, common_path, field_name):
+    """Read the given CSV files, align their column names and merge them on year, country and field."""
+    all_dfs_list = []
+    for file_id in file_ids:
+        df_add = pd.read_csv(common_path + file_id)
+        if "estimated" in file_id:
+            # Rename so the estimated series does not clash with the disclosed one after merging.
+            df_add.rename(columns={"disclosed_investment": "disclosed_investment_estimated"}, inplace=True)
+        all_dfs_list.append(df_add)
+
+    merged_df = all_dfs_list[0]
+    for df in all_dfs_list[1:]:
+        merged_df = pd.merge(merged_df, df, on=["year", "country", "field"])
+
+    merged_df.rename(columns={"field": field_name}, inplace=True)
+    return merged_df
+
+
+if __name__ == "__main__":
+    main()
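For reference, the column handling inside read_and_clean_data() above can be illustrated with in-memory frames in place of the local CSV files; the values are invented for the example, and only the rename-then-merge step mirrors the script:

import pandas as pd

# Stand-ins for companies_yearly_disclosed.csv and companies_yearly_estimated.csv.
disclosed = pd.DataFrame(
    {"year": [2020], "country": ["United States"], "field": ["AI"], "disclosed_investment": [10.0]}
)
estimated = pd.DataFrame(
    {"year": [2020], "country": ["United States"], "field": ["AI"], "disclosed_investment": [25.0]}
)

# Both files share the "disclosed_investment" column, so the estimated series is
# renamed before merging to keep the two measures side by side.
estimated = estimated.rename(columns={"disclosed_investment": "disclosed_investment_estimated"})
merged = pd.merge(disclosed, estimated, on=["year", "country", "field"])

# As in the script, "field" is then renamed to the metric group ("companies"), so that
# rows from different metric groups remain distinguishable after pd.concat().
merged = merged.rename(columns={"field": "companies"})
print(merged)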