Skip to content

Commit

Permalink
📊 AI: AI epoch adding regressions code (#3351)
Browse files Browse the repository at this point in the history
  • Loading branch information
veronikasamborska1994 authored Oct 4, 2024
1 parent ace5fa0 commit eae9470
Show file tree
Hide file tree
Showing 15 changed files with 594 additions and 29 deletions.
39 changes: 39 additions & 0 deletions dag/archive/artificial_intelligence.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,45 @@ steps:
data://grapher/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain:
- data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain

# Main EPOCH dataset
data://meadow/artificial_intelligence/2024-09-09/epoch:
- snapshot://artificial_intelligence/2024-09-09/epoch.csv
data://garden/artificial_intelligence/2024-09-09/epoch:
- data://meadow/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch:
- data://garden/artificial_intelligence/2024-09-09/epoch

# EPOCH aggregates by domain
data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain:
- data://meadow/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain:
- data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain

# EPOCH aggregates by researcher affiliation
data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation

# EPOCH dataset on Compute Intensive AI
data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- snapshot://artificial_intelligence/2024-09-09/epoch_compute_intensive.csv
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive

# EPOCH dataset on Compute Intensive AI, aggregates by country
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries

# EPOCH dataset on Compute Intensive AI, aggregates by domain
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain
##############################################################################################################

# AI Incidents
Expand Down
57 changes: 28 additions & 29 deletions dag/artificial_intelligence.yml
Original file line number Diff line number Diff line change
@@ -1,51 +1,50 @@
steps:

# Main EPOCH dataset
data://meadow/artificial_intelligence/2024-09-09/epoch:
- snapshot://artificial_intelligence/2024-09-09/epoch.csv
data://garden/artificial_intelligence/2024-09-09/epoch:
- data://meadow/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch:
- data://garden/artificial_intelligence/2024-09-09/epoch

data://meadow/artificial_intelligence/2024-10-01/epoch:
- snapshot://artificial_intelligence/2024-10-01/epoch.csv
data://garden/artificial_intelligence/2024-10-01/epoch:
- data://meadow/artificial_intelligence/2024-10-01/epoch
data://grapher/artificial_intelligence/2024-10-01/epoch:
- data://garden/artificial_intelligence/2024-10-01/epoch

# Main EPOCH dataset regression lines
data://garden/artificial_intelligence/2024-10-01/epoch_regressions:
- data://garden/artificial_intelligence/2024-10-01/epoch
data://grapher/artificial_intelligence/2024-10-01/epoch_regressions:
- data://garden/artificial_intelligence/2024-10-01/epoch_regressions

# EPOCH aggregates by domain
data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain:
- data://meadow/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain:
- data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain
data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_domain:
- data://meadow/artificial_intelligence/2024-10-01/epoch
data://grapher/artificial_intelligence/2024-10-01/epoch_aggregates_domain:
- data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_domain

# EPOCH aggregates by researcher affiliation
data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation
data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-10-01/epoch
data://grapher/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation

# EPOCH dataset on Compute Intensive AI
data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- snapshot://artificial_intelligence/2024-09-09/epoch_compute_intensive.csv
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive:
- snapshot://artificial_intelligence/2024-10-01/epoch_compute_intensive.csv
data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive:
- data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive:
- data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive

# EPOCH dataset on Compute Intensive AI, aggregates by country
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries
data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries:
- data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries:
- data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries

# EPOCH dataset on Compute Intensive AI, aggregates by domain
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain
data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain:
- data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain:
- data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain

# Large Language Models and Compute (EPOCH)
data://garden/artificial_intelligence/2024-02-15/epoch_llms:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

definitions:
common:
processing_level: major
presentation:
topic_tags:
- Artificial Intelligence
description_processing: |-
We performed a regression analysis, fitting exponential models to the data for both the pre-deep learning (before 2010) and deep learning eras (after 2010), using the code provided by researchers from Epoch.
dataset:
title: Parameter, Compute and Data Trends in Machine Learning - Regressions

Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""Load a meadow dataset and create a garden dataset."""

import numpy as np
import owid.catalog.processing as pr
import pandas as pd
from owid.catalog import Table
from sklearn.linear_model import LinearRegression

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)
DL_ERA_START = 2010
START_DATE = 1950
END_DATE = 2025.2


def run(dest_dir: str) -> None:
    """Build the epoch regressions garden dataset from the meadow epoch dataset.

    Loads the main epoch table, fits exponential trend lines per era/metric,
    stacks the trend rows on top of the original data, and saves the result.
    """
    paths.log.info("epoch.start")

    #
    # Load inputs.
    #
    # Load the meadow dataset and pull out its main table.
    ds_meadow = paths.load_dataset("epoch")
    tb = ds_meadow["epoch"].reset_index()

    # Compute regression trend lines.  run_regression adds a helper
    # "frac_year" column to `tb` in place; drop it before concatenating
    # the trend rows with the original rows.
    tb_trend = run_regression(tb)
    tb = pr.concat([tb_trend, tb.drop("frac_year", axis=1)])

    # Index the combined table by model and date.
    tb = tb.format(["days_since_1949", "system"])

    #
    # Save outputs.
    #
    # Create a garden dataset reusing the meadow dataset's metadata and save it.
    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
    ds_garden.save()

    paths.log.info("epoch.end")


def fit_exponential(models, metric):
    """Fit an exponential growth model to *metric* over fractional years.

    The fit is ordinary least squares in log10 space,
    log10(metric) = intercept + slope * frac_year, i.e. the model is
    metric = 10 ** (intercept + slope * frac_year).
    (Adapted from code provided by the Epoch AI team.)

    Parameters
    ----------
    models : DataFrame/Table with a "frac_year" column and a *metric* column.
    metric : str
        Name of the column to fit.

    Returns
    -------
    tuple (intercept, slope)
        Log10-space coefficients; `slope` is the number of orders of
        magnitude gained per year.
    """
    x = models["frac_year"].values
    y = models[metric]

    # log10 is undefined for non-positive values; drop them first.
    positive_mask = y > 0
    x = x[positive_mask]
    y = np.log10(y[positive_mask])

    # Drop infinities and implausibly large values before fitting.
    finite_mask = np.isfinite(y) & (y < np.finfo(np.float32).max)
    x = x[finite_mask]
    y = y[finite_mask]

    # 1-D least squares line fit; polyfit returns [slope, intercept].
    slope, intercept = np.polyfit(x, y, 1)
    return intercept, slope


def run_regression(tb):
    """Fit exponential trend lines per metric for the pre-deep-learning
    (< 2010) and deep-learning (>= 2010) eras and return them as one table.

    NOTE: this mutates *tb* in place by adding a "frac_year" column; the
    caller relies on this and drops the column afterwards.

    Parameters
    ----------
    tb : Table
        Must contain "publication_date", "days_since_1949" and the metric
        columns listed below.

    Returns
    -------
    Table
        Two rows (start/end point) per era per metric, keyed by
        ("system", "days_since_1949"), where "system" holds the growth-rate
        label (e.g. "4.1x/year").
    """
    # Convert publication dates to fractional years so the regression runs
    # on a single numeric axis.
    publication_dates = tb["publication_date"]
    tb.loc[:, "frac_year"] = (
        publication_dates.dt.year + (publication_dates.dt.month - 1) / 12 + (publication_dates.dt.day - 1) / 365
    )
    tb = tb.sort_values(by="frac_year")

    metrics = ["training_computation_petaflop", "parameters", "training_dataset_size__datapoints"]
    new_tables = []

    for metric in metrics:
        # Filter out models without the metric information
        tb_metric = tb[pd.notnull(tb[metric])]

        # Fit exponential models for pre-DL and DL eras; index 1 of the fit
        # is the slope, i.e. orders of magnitude gained per year.
        pre_dl_models = tb_metric[tb_metric["frac_year"] < DL_ERA_START]
        pre_dl_fit = fit_exponential(pre_dl_models, metric)
        pre_dl_oom_per_year = pre_dl_fit[1]

        dl_models = tb_metric[tb_metric["frac_year"] >= DL_ERA_START]
        dl_fit = fit_exponential(dl_models, metric)
        dl_oom_per_year = dl_fit[1]

        # Log the growth factors (10**slope = multiplicative factor per year).
        pre_dl_info = f"{10**pre_dl_oom_per_year:.1f}x/year"
        dl_info = f"{10**dl_oom_per_year:.1f}x/year"
        paths.log.info(f"Pre Deep Learning Era ({metric}): {pre_dl_info}")
        paths.log.info(f"Deep Learning Era ({metric}): {dl_info}")

        # Two points per era suffice because the trend is a straight line in
        # log space: 1950-2010 and 2010-2025.2.
        pre_dl_year_grid = np.array([START_DATE, DL_ERA_START])
        dl_year_grid = np.array([DL_ERA_START, END_DATE])

        # Evaluate the fitted exponential models at the grid endpoints.
        pre_dl_line = 10 ** (pre_dl_fit[0] + pre_dl_year_grid * pre_dl_fit[1])
        dl_line = 10 ** (dl_fit[0] + dl_year_grid * dl_fit[1])

        # Build two-row frames holding the start and end point of each era's
        # trend line, anchored to the observed days_since_1949 range.
        pre_dl_df = pd.DataFrame(
            {
                "days_since_1949": [
                    tb_metric["days_since_1949"].min(),
                    tb_metric[tb_metric["frac_year"] < DL_ERA_START]["days_since_1949"].max(),
                ],
                metric: [pre_dl_line[0], pre_dl_line[-1]],
                "system": [pre_dl_info] * 2,
            }
        )

        dl_df = pd.DataFrame(
            {
                "days_since_1949": [
                    tb_metric[tb_metric["frac_year"] >= DL_ERA_START]["days_since_1949"].min(),
                    tb_metric["days_since_1949"].max(),
                ],
                metric: [dl_line[0], dl_line[-1]],
                "system": [dl_info] * 2,
            }
        )

        # Combine the pre-deep learning and deep learning era DataFrames
        new_tables.append(pd.concat([pre_dl_df, dl_df], ignore_index=True))

    # Merge the per-metric frames on the shared keys so each metric becomes
    # its own column.
    tb_new = new_tables[0]
    for tb_m in new_tables[1:]:
        tb_new = pd.merge(tb_new, tb_m, on=["system", "days_since_1949"], how="outer")

    # Convert to OWID Table and propagate origins metadata for provenance.
    tb_new = Table(tb_new, short_name=paths.short_name)
    for column in tb_new.columns:
        tb_new[column].metadata.origins = tb["publication_date"].metadata.origins

    return tb_new
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset, grapher_checks

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Publish the affiliation aggregates to grapher.

    Grapher plots entities via a dimension named "country", so the research
    affiliation categorisation is renamed to "country" before saving.
    """
    #
    # Load inputs.
    #
    # Load the garden dataset and read its table.
    ds_garden = paths.load_dataset("epoch_aggregates_affiliation")
    tb = ds_garden["epoch_aggregates_affiliation"]

    #
    # Process data.
    #
    # Reuse grapher's "country" entity dimension for the affiliation category.
    tb = tb.rename_index_names({"organization_categorization": "country"})

    #
    # Save outputs.
    #
    # Build the grapher dataset with the garden metadata, sanity-check it,
    # and save.
    ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)
    grapher_checks(ds_grapher)
    ds_grapher.save()
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset, grapher_checks

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Publish the country aggregates garden dataset to grapher unchanged."""
    #
    # Load inputs.
    #
    # Load the garden dataset and read its table.
    ds_garden = paths.load_dataset("epoch_aggregates_countries")
    tb_garden = ds_garden["epoch_aggregates_countries"]

    #
    # Save outputs.
    #
    # Build the grapher dataset reusing the garden metadata.
    ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata)

    # Run standard grapher sanity checks before persisting.
    grapher_checks(ds_grapher)
    ds_grapher.save()
Loading

0 comments on commit eae9470

Please sign in to comment.