Skip to content

Commit

Permalink
📊 AI: AI epoch adding regressions code (#3351)
Browse files Browse the repository at this point in the history
  • Loading branch information
veronikasamborska1994 authored Oct 4, 2024
1 parent ace5fa0 commit eae9470
Show file tree
Hide file tree
Showing 15 changed files with 594 additions and 29 deletions.
39 changes: 39 additions & 0 deletions dag/archive/artificial_intelligence.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,45 @@ steps:
data://grapher/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain:
- data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain

# Main EPOCH dataset
data://meadow/artificial_intelligence/2024-09-09/epoch:
- snapshot://artificial_intelligence/2024-09-09/epoch.csv
data://garden/artificial_intelligence/2024-09-09/epoch:
- data://meadow/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch:
- data://garden/artificial_intelligence/2024-09-09/epoch

# EPOCH aggregates by domain
data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain:
- data://meadow/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain:
- data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain

# EPOCH aggregates by researcher affiliation
data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation

# EPOCH dataset on Compute Intensive AI
data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- snapshot://artificial_intelligence/2024-09-09/epoch_compute_intensive.csv
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive

# EPOCH dataset on Compute Intensive AI, aggregates by country
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries

# EPOCH dataset on Compute Intensive AI, aggregates by domain
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain
##############################################################################################################

# AI Incidents
Expand Down
57 changes: 28 additions & 29 deletions dag/artificial_intelligence.yml
Original file line number Diff line number Diff line change
@@ -1,51 +1,50 @@
steps:

# Main EPOCH dataset
data://meadow/artificial_intelligence/2024-09-09/epoch:
- snapshot://artificial_intelligence/2024-09-09/epoch.csv
data://garden/artificial_intelligence/2024-09-09/epoch:
- data://meadow/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch:
- data://garden/artificial_intelligence/2024-09-09/epoch

data://meadow/artificial_intelligence/2024-10-01/epoch:
- snapshot://artificial_intelligence/2024-10-01/epoch.csv
data://garden/artificial_intelligence/2024-10-01/epoch:
- data://meadow/artificial_intelligence/2024-10-01/epoch
data://grapher/artificial_intelligence/2024-10-01/epoch:
- data://garden/artificial_intelligence/2024-10-01/epoch

# Main EPOCH dataset regression lines
data://garden/artificial_intelligence/2024-10-01/epoch_regressions:
- data://garden/artificial_intelligence/2024-10-01/epoch
data://grapher/artificial_intelligence/2024-10-01/epoch_regressions:
- data://garden/artificial_intelligence/2024-10-01/epoch_regressions

# EPOCH aggregates by domain
data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain:
- data://meadow/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain:
- data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain
data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_domain:
- data://meadow/artificial_intelligence/2024-10-01/epoch
data://grapher/artificial_intelligence/2024-10-01/epoch_aggregates_domain:
- data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_domain

# EPOCH aggregates by researcher affiliation
data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-09-09/epoch
data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation
data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-10-01/epoch
data://grapher/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation:
- data://garden/artificial_intelligence/2024-10-01/epoch_aggregates_affiliation

# EPOCH dataset on Compute Intensive AI
data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- snapshot://artificial_intelligence/2024-09-09/epoch_compute_intensive.csv
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive:
- snapshot://artificial_intelligence/2024-10-01/epoch_compute_intensive.csv
data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive:
- data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive:
- data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive

# EPOCH dataset on Compute Intensive AI, aggregates by country
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries
data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries:
- data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries:
- data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_countries

# EPOCH dataset on Compute Intensive AI, aggregates by domain
data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain:
- data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain:
- data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain
data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain:
- data://meadow/artificial_intelligence/2024-10-01/epoch_compute_intensive
data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain:
- data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain

# Large Language Models and Compute (EPOCH)
data://garden/artificial_intelligence/2024-02-15/epoch_llms:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

definitions:
common:
processing_level: major
presentation:
topic_tags:
- Artificial Intelligence
description_processing: |-
We performed a regression analysis, fitting exponential models to the data for both the pre-deep learning (before 2010) and deep learning eras (after 2010), using the code provided by researchers from Epoch.
dataset:
title: Parameter, Compute and Data Trends in Machine Learning - Regressions

Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""Load a meadow dataset and create a garden dataset."""

import numpy as np
import owid.catalog.processing as pr
import pandas as pd
from owid.catalog import Table
from sklearn.linear_model import LinearRegression

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)
DL_ERA_START = 2010
START_DATE = 1950
END_DATE = 2025.2


def run(dest_dir: str) -> None:
    """Build the epoch regressions garden dataset from the meadow epoch dataset.

    Loads the main epoch table, fits exponential trend lines per era/metric,
    stacks the trend rows on top of the original data, and saves the result.
    """
    paths.log.info("epoch.start")

    #
    # Load inputs.
    #
    # Load the meadow dataset and pull out its main table.
    ds_meadow = paths.load_dataset("epoch")
    tb = ds_meadow["epoch"].reset_index()

    # Compute regression trend lines.  run_regression adds a helper
    # "frac_year" column to `tb` in place; drop it before concatenating
    # the trend rows with the original rows.
    tb_trend = run_regression(tb)
    tb = pr.concat([tb_trend, tb.drop("frac_year", axis=1)])

    # Index the combined table by model and date.
    tb = tb.format(["days_since_1949", "system"])

    #
    # Save outputs.
    #
    # Create a garden dataset reusing the meadow dataset's metadata and save it.
    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
    ds_garden.save()

    paths.log.info("epoch.end")


def fit_exponential(models, metric):
    """Fit an exponential growth model to *metric* over fractional years.

    The fit is ordinary least squares in log10 space,
    log10(metric) = intercept + slope * frac_year, i.e. the model is
    metric = 10 ** (intercept + slope * frac_year).
    (Adapted from code provided by the Epoch AI team.)

    Parameters
    ----------
    models : DataFrame/Table with a "frac_year" column and a *metric* column.
    metric : str
        Name of the column to fit.

    Returns
    -------
    tuple (intercept, slope)
        Log10-space coefficients; `slope` is the number of orders of
        magnitude gained per year.
    """
    x = models["frac_year"].values
    y = models[metric]

    # log10 is undefined for non-positive values; drop them first.
    positive_mask = y > 0
    x = x[positive_mask]
    y = np.log10(y[positive_mask])

    # Drop infinities and implausibly large values before fitting.
    finite_mask = np.isfinite(y) & (y < np.finfo(np.float32).max)
    x = x[finite_mask]
    y = y[finite_mask]

    # 1-D least squares line fit; polyfit returns [slope, intercept].
    slope, intercept = np.polyfit(x, y, 1)
    return intercept, slope


def run_regression(tb):
    """Fit exponential trend lines per metric for the pre-deep-learning
    (< 2010) and deep-learning (>= 2010) eras and return them as one table.

    NOTE: this mutates *tb* in place by adding a "frac_year" column; the
    caller relies on this and drops the column afterwards.

    Parameters
    ----------
    tb : Table
        Must contain "publication_date", "days_since_1949" and the metric
        columns listed below.

    Returns
    -------
    Table
        Two rows (start/end point) per era per metric, keyed by
        ("system", "days_since_1949"), where "system" holds the growth-rate
        label (e.g. "4.1x/year").
    """
    # Convert publication dates to fractional years so the regression runs
    # on a single numeric axis.
    publication_dates = tb["publication_date"]
    tb.loc[:, "frac_year"] = (
        publication_dates.dt.year + (publication_dates.dt.month - 1) / 12 + (publication_dates.dt.day - 1) / 365
    )
    tb = tb.sort_values(by="frac_year")

    metrics = ["training_computation_petaflop", "parameters", "training_dataset_size__datapoints"]
    new_tables = []

    for metric in metrics:
        # Filter out models without the metric information
        tb_metric = tb[pd.notnull(tb[metric])]

        # Fit exponential models for pre-DL and DL eras; index 1 of the fit
        # is the slope, i.e. orders of magnitude gained per year.
        pre_dl_models = tb_metric[tb_metric["frac_year"] < DL_ERA_START]
        pre_dl_fit = fit_exponential(pre_dl_models, metric)
        pre_dl_oom_per_year = pre_dl_fit[1]

        dl_models = tb_metric[tb_metric["frac_year"] >= DL_ERA_START]
        dl_fit = fit_exponential(dl_models, metric)
        dl_oom_per_year = dl_fit[1]

        # Log the growth factors (10**slope = multiplicative factor per year).
        pre_dl_info = f"{10**pre_dl_oom_per_year:.1f}x/year"
        dl_info = f"{10**dl_oom_per_year:.1f}x/year"
        paths.log.info(f"Pre Deep Learning Era ({metric}): {pre_dl_info}")
        paths.log.info(f"Deep Learning Era ({metric}): {dl_info}")

        # Two points per era suffice because the trend is a straight line in
        # log space: 1950-2010 and 2010-2025.2.
        pre_dl_year_grid = np.array([START_DATE, DL_ERA_START])
        dl_year_grid = np.array([DL_ERA_START, END_DATE])

        # Evaluate the fitted exponential models at the grid endpoints.
        pre_dl_line = 10 ** (pre_dl_fit[0] + pre_dl_year_grid * pre_dl_fit[1])
        dl_line = 10 ** (dl_fit[0] + dl_year_grid * dl_fit[1])

        # Build two-row frames holding the start and end point of each era's
        # trend line, anchored to the observed days_since_1949 range.
        pre_dl_df = pd.DataFrame(
            {
                "days_since_1949": [
                    tb_metric["days_since_1949"].min(),
                    tb_metric[tb_metric["frac_year"] < DL_ERA_START]["days_since_1949"].max(),
                ],
                metric: [pre_dl_line[0], pre_dl_line[-1]],
                "system": [pre_dl_info] * 2,
            }
        )

        dl_df = pd.DataFrame(
            {
                "days_since_1949": [
                    tb_metric[tb_metric["frac_year"] >= DL_ERA_START]["days_since_1949"].min(),
                    tb_metric["days_since_1949"].max(),
                ],
                metric: [dl_line[0], dl_line[-1]],
                "system": [dl_info] * 2,
            }
        )

        # Combine the pre-deep learning and deep learning era DataFrames
        new_tables.append(pd.concat([pre_dl_df, dl_df], ignore_index=True))

    # Merge the per-metric frames on the shared keys so each metric becomes
    # its own column.
    tb_new = new_tables[0]
    for tb_m in new_tables[1:]:
        tb_new = pd.merge(tb_new, tb_m, on=["system", "days_since_1949"], how="outer")

    # Convert to OWID Table and propagate origins metadata for provenance.
    tb_new = Table(tb_new, short_name=paths.short_name)
    for column in tb_new.columns:
        tb_new[column].metadata.origins = tb["publication_date"].metadata.origins

    return tb_new
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset, grapher_checks

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Publish the affiliation aggregates to grapher.

    Grapher plots entities via a dimension named "country", so the research
    affiliation categorisation is renamed to "country" before saving.
    """
    #
    # Load inputs.
    #
    # Load the garden dataset and read its table.
    ds_garden = paths.load_dataset("epoch_aggregates_affiliation")
    tb = ds_garden["epoch_aggregates_affiliation"]

    #
    # Process data.
    #
    # Reuse grapher's "country" entity dimension for the affiliation category.
    tb = tb.rename_index_names({"organization_categorization": "country"})

    #
    # Save outputs.
    #
    # Build the grapher dataset with the garden metadata, sanity-check it,
    # and save.
    ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)
    grapher_checks(ds_grapher)
    ds_grapher.save()
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset, grapher_checks

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Publish the country aggregates garden dataset to grapher unchanged."""
    #
    # Load inputs.
    #
    # Load the garden dataset and read its table.
    ds_garden = paths.load_dataset("epoch_aggregates_countries")
    tb_garden = ds_garden["epoch_aggregates_countries"]

    #
    # Save outputs.
    #
    # Build the grapher dataset reusing the garden metadata.
    ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata)

    # Run standard grapher sanity checks before persisting.
    grapher_checks(ds_grapher)
    ds_grapher.save()
Loading

0 comments on commit eae9470

Please sign in to comment.