From d5608920d2466a605c3837ffbfc5418a00f57ea7 Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Mon, 9 Sep 2024 16:07:02 +0200 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=93=8A=20ai:=20Epoch=20AI=20september?= =?UTF-8?q?=20update?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dag/archive/artificial_intelligence.yml | 39 +++++ dag/artificial_intelligence.yml | 56 +++---- .../2024-09-09/epoch.meta.yml | 98 ++++++++++++ .../2024-09-09/epoch.py | 144 ++++++++++++++++++ .../epoch_aggregates_affiliation.meta.yml | 35 +++++ .../epoch_aggregates_affiliation.py | 75 +++++++++ .../epoch_aggregates_domain.meta.yml | 53 +++++++ .../2024-09-09/epoch_aggregates_domain.py | 99 ++++++++++++ .../epoch_compute_intensive.meta.yml | 91 +++++++++++ .../2024-09-09/epoch_compute_intensive.py | 60 ++++++++ ...compute_intensive_countries.countries.json | 18 +++ ...epoch_compute_intensive_countries.meta.yml | 31 ++++ .../epoch_compute_intensive_countries.py | 59 +++++++ .../epoch_compute_intensive_domain.meta.yml | 48 ++++++ .../epoch_compute_intensive_domain.py | 52 +++++++ .../2024-09-09/shared.py | 74 +++++++++ .../2024-09-09/epoch.meta.yml | 18 +++ .../2024-09-09/epoch.py | 89 +++++++++++ .../epoch_aggregates_affiliation.py | 41 +++++ .../2024-09-09/epoch_aggregates_countries.py | 30 ++++ .../2024-09-09/epoch_aggregates_domain.py | 39 +++++ .../epoch_aggregates_organizations.py | 38 +++++ .../2024-09-09/epoch_compute_intensive.py | 33 ++++ .../epoch_compute_intensive_countries.py | 30 ++++ .../epoch_compute_intensive_domain.py | 39 +++++ .../epoch_compute_intensive_organizations.py | 38 +++++ .../2024-09-09/epoch.py | 73 +++++++++ .../2024-09-09/epoch_compute_intensive.py | 66 ++++++++ .../2024-09-09/epoch.csv.dvc | 37 +++++ .../2024-09-09/epoch.py | 33 ++++ .../epoch_compute_intensive.csv.dvc | 32 ++++ .../2024-09-09/epoch_compute_intensive.py | 24 +++ 32 files changed, 1664 insertions(+), 28 deletions(-) create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch.meta.yml create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch.py create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.meta.yml create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.py create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain.meta.yml create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain.py create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive.meta.yml create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive.py create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.countries.json create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.meta.yml create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.py create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.meta.yml create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.py create mode 100644 etl/steps/data/garden/artificial_intelligence/2024-09-09/shared.py create mode 100644 
etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.meta.yml create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_countries.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_organizations.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.py create mode 100644 etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_organizations.py create mode 100644 etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch.py create mode 100644 etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive.py create mode 100644 snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc create mode 100644 snapshots/artificial_intelligence/2024-09-09/epoch.py create mode 100644 snapshots/artificial_intelligence/2024-09-09/epoch_compute_intensive.csv.dvc create mode 100644 snapshots/artificial_intelligence/2024-09-09/epoch_compute_intensive.py diff --git a/dag/archive/artificial_intelligence.yml b/dag/archive/artificial_intelligence.yml index b1f373497b5..494e0c45457 100644 --- a/dag/archive/artificial_intelligence.yml +++ b/dag/archive/artificial_intelligence.yml @@ -110,6 +110,45 @@ steps: data://grapher/artificial_intelligence/2024-06-19/epoch_compute_intensive: - data://garden/artificial_intelligence/2024-06-19/epoch_compute_intensive + # Main EPOCH dataset + data://meadow/artificial_intelligence/2024-08-05/epoch: + - snapshot://artificial_intelligence/2024-08-05/epoch.csv + data://garden/artificial_intelligence/2024-08-05/epoch: + - data://meadow/artificial_intelligence/2024-08-05/epoch + data://grapher/artificial_intelligence/2024-08-05/epoch: + - data://garden/artificial_intelligence/2024-08-05/epoch + + # EPOCH aggregates by domain + data://garden/artificial_intelligence/2024-08-05/epoch_aggregates_domain: + - data://meadow/artificial_intelligence/2024-08-05/epoch + data://grapher/artificial_intelligence/2024-08-05/epoch_aggregates_domain: + - data://garden/artificial_intelligence/2024-08-05/epoch_aggregates_domain + + # EPOCH aggregates by researcher affiliaiton + data://garden/artificial_intelligence/2024-08-05/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-08-05/epoch + data://grapher/artificial_intelligence/2024-08-05/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-08-05/epoch_aggregates_affiliation + + # EPOCH dataset on Compute Intensive AI + data://meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive: + - snapshot://artificial_intelligence/2024-08-05/epoch_compute_intensive.csv + data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive: + - data://meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-08-05/epoch_compute_intensive: + - data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive + + # EPOCH dataset on Compute 
Intensive AI, aggregates by country + data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_countries: + - data://meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-08-05/epoch_compute_intensive_countries: + - data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_countries + + # EPOCH dataset on Compute Intensive AI, aggregates by domain + data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain: + - data://meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain: + - data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain ############################################################################################################## diff --git a/dag/artificial_intelligence.yml b/dag/artificial_intelligence.yml index d47ef1d4016..34dfb7fb666 100644 --- a/dag/artificial_intelligence.yml +++ b/dag/artificial_intelligence.yml @@ -1,44 +1,44 @@ steps: # Main EPOCH dataset - data://meadow/artificial_intelligence/2024-08-05/epoch: - - snapshot://artificial_intelligence/2024-08-05/epoch.csv - data://garden/artificial_intelligence/2024-08-05/epoch: - - data://meadow/artificial_intelligence/2024-08-05/epoch - data://grapher/artificial_intelligence/2024-08-05/epoch: - - data://garden/artificial_intelligence/2024-08-05/epoch + data://meadow/artificial_intelligence/2024-09-09/epoch: + - snapshot://artificial_intelligence/2024-09-09/epoch.csv + data://garden/artificial_intelligence/2024-09-09/epoch: + - data://meadow/artificial_intelligence/2024-09-09/epoch + data://grapher/artificial_intelligence/2024-09-09/epoch: + - data://garden/artificial_intelligence/2024-09-09/epoch # EPOCH aggregates by domain - data://garden/artificial_intelligence/2024-08-05/epoch_aggregates_domain: - - data://meadow/artificial_intelligence/2024-08-05/epoch - data://grapher/artificial_intelligence/2024-08-05/epoch_aggregates_domain: - - data://garden/artificial_intelligence/2024-08-05/epoch_aggregates_domain + data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain: + - data://meadow/artificial_intelligence/2024-09-09/epoch + data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain: + - data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain # EPOCH aggregates by researcher affiliaiton - data://garden/artificial_intelligence/2024-08-05/epoch_aggregates_affiliation: - - data://garden/artificial_intelligence/2024-08-05/epoch - data://grapher/artificial_intelligence/2024-08-05/epoch_aggregates_affiliation: - - data://garden/artificial_intelligence/2024-08-05/epoch_aggregates_affiliation + data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-09-09/epoch + data://grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation # EPOCH dataset on Compute Intensive AI - data://meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive: - - snapshot://artificial_intelligence/2024-08-05/epoch_compute_intensive.csv - data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive: - - data://meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive - data://grapher/artificial_intelligence/2024-08-05/epoch_compute_intensive: - - 
data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive + data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive: + - snapshot://artificial_intelligence/2024-09-09/epoch_compute_intensive.csv + data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive: + - data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive: + - data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive # EPOCH dataset on Compute Intensive AI, aggregates by country - data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_countries: - - data://meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive - data://grapher/artificial_intelligence/2024-08-05/epoch_compute_intensive_countries: - - data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_countries + data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries: + - data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries: + - data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries # EPOCH dataset on Compute Intensive AI, aggregates by domain - data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain: - - data://meadow/artificial_intelligence/2024-08-05/epoch_compute_intensive - data://grapher/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain: - - data://garden/artificial_intelligence/2024-08-05/epoch_compute_intensive_domain + data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain: + - data://meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain: + - data://garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain # Large Language Models and Compute (EPOCH) data://garden/artificial_intelligence/2024-02-15/epoch_llms: diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch.meta.yml new file mode 100644 index 00000000000..f8d02877235 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch.meta.yml @@ -0,0 +1,98 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch: + variables: + domain: + title: Domain + unit: '' + short_unit: '' + description_short: Refers to the specific area, application, or field in which an AI system is designed to operate. + description_processing: |- + In cases where multiple domains were associated with a system, we consolidated these entries under the label "Multiple domains". We also identified domains associated with fewer than 20 notable systems and grouped these under the category 'Other'. 
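The consolidation described here is applied in the accompanying garden step (epoch.py). Below is a minimal pandas sketch of the same logic, run on a hypothetical toy table rather than the real Epoch export; the column name and the threshold of 20 are taken from that step.

```python
import pandas as pd

# Toy stand-in for the garden table: one row per AI system; "domain" may hold a
# comma-separated list of domains, or be missing entirely.
tb = pd.DataFrame({"domain": ["Language", "Vision,Language", "Games", None, "Language"]})

# Label missing domains explicitly and consolidate multi-domain systems.
tb["domain"] = tb["domain"].fillna("Not specified")
tb.loc[tb["domain"].str.contains(","), "domain"] = "Multiple domains"

# Domains associated with fewer than 20 notable systems are grouped under "Other"
# (with this toy table every domain falls below the threshold).
domain_counts = tb["domain"].value_counts()
tb["domain"] = tb["domain"].where(tb["domain"].map(domain_counts) >= 20, "Other")
```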
+ display: + zeroDay: '1949-01-01' + yearIsDay: true + + organization_categorization: + title: Researcher affiliation + unit: '' + short_unit: '' + description_short: Describes the sector where the authors of an AI system have their primary affiliations. + description_from_producer: |- + Systems are categorized as “Industry” if their authors are affiliated with private sector organizations, “Academia” if the authors are affiliated with universities or academic institutions, or “Industry - Academia Collaboration” when at least 30% of the authors are from each. + + parameters: + title: Number of parameters + unit: '' + description_short: Total number of learnable variables or weights that the model contains. Parameters are adjusted during the training process to optimize the model's performance. + description_key: + - Parameters are internal variables that machine learning models adjust during their training process to improve their ability to make accurate predictions. They act as the model's "knobs" that are fine-tuned based on the provided data. In deep learning, a subset of artificial intelligence (AI), parameters primarily consist of the weights assigned to the connections between the small processing units called neurons. Picture a vast network of interconnected neurons where the strength of each connection represents a parameter. + + - The total number of parameters in a model is influenced by various factors. The model's structure and the number of “layers” of neurons play a significant role. Generally, more complex models with additional layers tend to have a higher number of parameters. Special components of specific deep learning architectures can further contribute to the overall parameter count. + + - Understanding the number of parameters in a model is crucial to design effective models. More parameters can help the model understand complex data patterns, potentially leading to higher accuracy. However, there's a fine balance to strike. If a model has too many parameters, it risks memorizing the specific examples in its training data rather than learning their underlying patterns. Consequently, it may perform poorly when presented with new, unseen data. Achieving the right balance of parameters is a critical consideration in model development. + + - In recent times, the AI community has witnessed the emergence of what are often referred to as "giant models." These models boast an astounding number of parameters, reaching into the billions or even trillions. While these huge models have achieved remarkable performance, they have a significant computational cost. Effectively managing and training such large-scale models has become a prominent and active area of research and discussion within the AI field. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_dataset_size__datapoints: + title: Training dataset size + unit: 'datapoints' + description_short: The number of examples provided to train an AI model. Typically, more data results in a more comprehensive understanding by the model. + description_key: + - Training data size refers to the volume of data employed to train an artificial intelligence (AI) model effectively. It's a representation of the number of examples that the model learns from during its training process. It is a fundamental measure of the scope of the data used in the model's learning phase. + + - To grasp the concept of training data size, imagine teaching a friend the art of distinguishing different types of birds. 
In this analogy, each bird picture presented to your friend corresponds to an individual piece of training data. If you showed them 100 unique bird photos, then the training data size in this scenario would be quantified as 100. + + - Training data size is an essential indicator in AI and machine learning. First and foremost, it directly impacts the depth of learning achieved by the model. The more extensive the dataset, the more profound and comprehensive the model's understanding of the subject matter becomes. Additionally, a large training data size contributes significantly to improved recognition capabilities. By exposing the model to a diverse array of examples, it becomes adept at identifying subtle nuances, much like how it becomes skilled at distinguishing various bird species through exposure to a large variety of bird images. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_computation_petaflop: + title: Training computation (petaFLOP) + unit: 'petaFLOP' + description_short: Computation is measured in total petaFLOP, which is 10¹⁵ [floating-point operations](#dod:flop) estimated from AI literature, albeit with some uncertainty. + description_key: + - In the context of artificial intelligence (AI), training computation is predominantly measured using floating-point operations or “FLOP”. One FLOP represents a single arithmetic operation involving floating-point numbers, such as addition, subtraction, multiplication, or division. To adapt to the vast computational demands of AI systems, the measurement unit of petaFLOP is commonly used. One petaFLOP stands as a staggering one quadrillion FLOPs, underscoring the magnitude of computational operations within AI. + + - Modern AI systems are rooted in machine learning and deep learning techniques. These methodologies are notorious for their computational intensity, involving complex mathematical processes and algorithms. During the training phase, AI models process large volumes of data, while continuously adapting and refining their parameters to optimize performance, rendering the training process computationally intensive. + + - Many factors influence the magnitude of training computation within AI systems. Notably, the size of the dataset employed for training significantly impacts the computational load. Larger datasets necessitate more processing power. The complexity of the model's architecture also plays a pivotal role; more intricate models lead to more computations. Parallel processing, involving the simultaneous use of multiple processors, also has a substantial effect. Beyond these factors, specific design choices and other variables further contribute to the complexity and scale of training computation within AI. + + description_processing: Training computation was converted from its original measurement in FLOPs (floating-point operations) to a more manageable unit known as petaFLOPs. This conversion is performed by dividing the original training compute value by 1e15, which represents one quadrillion (10^15). The purpose of this conversion is to provide a more human-readable and practical representation of the immense computational efforts involved in training AI systems. By expressing the training computation in petaFLOPs, it becomes easier to grasp the scale and magnitude of the computational resources required for training these systems, especially when dealing with large datasets and complex architectures. 
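The conversion is a plain division by 10¹⁵: for example, a training run estimated at 3.1 × 10²³ FLOP is recorded as 3.1 × 10⁸ petaFLOP. In the garden step this is a single line (column name as in the Epoch export):

```python
# 1 petaFLOP = 1e15 FLOP, so divide the reported FLOP estimate by 1e15.
tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15
```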
+ display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + presentation: + grapher_config: + title: Training computation + + publication_date: + title: Publication date + unit: '' + description_short: The date when the AI system was first published. + description_from_producer: The publication, announcement, or release date of the model, in YYYY-MM-DD format. If the year and month are known but the day is unknown, the day is filled in as YYYY-MM-15. If the year is known but the month and day are unknown, the month and day are filled in as YYYY-07-01. + + + diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch.py b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch.py new file mode 100644 index 00000000000..764785f5ced --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch.py @@ -0,0 +1,144 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_meadow["epoch"] + tb = tb.reset_index() + + # + # Process data. + # + # Filter notable systems by selecting rows where 'notability_criteria' is not nan + tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True) + tb = tb.drop("notability_criteria", axis=1) + + # Convert relevant columns to string type + columns = ["system", "domain", "organization_categorization"] + tb[columns] = tb[columns].astype(str) + + def simplify_entry(entry): + """ + Simplifies an entry of organization categories which can include many entries of Industry, Academia etc. + Removes duplicates, ensures all words except the first one start with a lower case letter,and joins the categories with ", " and " and " before the last one. 
+ """ + # Check for "nan" + if entry == "nan": + return "Not specified" + + # Split the entry into categories, convert to set to remove duplicates + categories = sorted(set(entry.split(","))) + + # Make sure all words except the first one start with a lower case letter + categories = [categories[0]] + [category.lower() for category in categories[1:]] + + # Join the categories with ", " and " and " before the last one + if len(categories) > 1: + simplified_entry = ", ".join(categories[:-1]) + " and " + categories[-1] + " collaboration" + else: + simplified_entry = categories[0] + + return simplified_entry + + tb["organization_categorization"] = tb["organization_categorization"].apply(simplify_entry) + + # Get the unique values in the organization_categorization column and compare them to expected affiliations + unique_values = set(tb["organization_categorization"]) + expected_values = { + "Industry", + "Academia", + "Government", + "Academia and industry collaboration", + "Academia and research collective collaboration", + "Industry and research collective collaboration", + "Academia, industry and research collective collaboration", + "Government and industry collaboration", + "Research collective", + "Academia, government and industry collaboration", + "Academia and government collaboration", + "Academia, government, industry and research collective collaboration", + "Not specified", + } + assert unique_values == expected_values, "Unexpected affiliations in organization_categorization column" + + # Replace affiliation of researchers with less than 20 systems with 'Other' + affiliation_counts = tb["organization_categorization"].value_counts() + + tb["organization_categorization"] = tb["organization_categorization"].where( + tb["organization_categorization"].map(affiliation_counts) >= 20, "Other" + ) + # Get the organizations that were reclassified to 'Other' + reclassified_organizations = affiliation_counts[affiliation_counts < 20].index.tolist() + + paths.log.info( + f"Affiliations of researchers with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_organizations)}" + ) + + # Replace nans with Unspecified in each column to avoid issues when calculating sume of notable systems + columns = ["organization_categorization", "domain", "organization"] + tb[columns] = tb[columns].replace("nan", "Not specified") + + # Check for multiple entries in 'domain' separated by comma + multiple_domains = tb["domain"].str.contains(",") + # Replace entries in 'domain' that contain a comma with 'Multiple Domains' + tb.loc[multiple_domains, "domain"] = "Multiple domains" + + # Replace domains with less than 20 systems with 'Other' + domain_counts = tb["domain"].value_counts() + + tb["domain"] = tb["domain"].where(tb["domain"].map(domain_counts) >= 20, "Other") + # Get the domains that were reclassified to 'Other' + reclassified_domains = domain_counts[domain_counts < 20].index.tolist() + + paths.log.info( + f"Domains with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}" + ) + # Convert FLOP to petaFLOP and remove the column with FLOPs (along with training time in hours) + tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15 + + # Convert publication date to a datetime objects + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + + # Calculate 'days_since_1949' + tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64") + tb = 
tb.dropna(subset=["days_since_1949"]) + + tb = tb.reset_index(drop=True) + + assert not tb[["system", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values" + + # Drop columns that are not needed + tb = tb.drop( + ["training_compute__flop", "organization", "authors", "country__from_organization"], + axis=1, + ) + tb = tb.format(["days_since_1949", "system"]) + + # Add metadata to the publication date column + tb["publication_date"].metadata.origins = tb["domain"].metadata.origins + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.meta.yml new file mode 100644 index 00000000000..1bf9422d84a --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.meta.yml @@ -0,0 +1,35 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + description_short: Describes the sector where the authors of a notable AI system have their primary affiliations. {definitions.desc_update} + description_from_producer: |- + The distinction is documented in [Academia and Industry](https://docs.google.com/document/d/1wyJmDOWDEKItg0QhO5cpsNAgHq4aHOxQQZnTfzm34gI/edit). + Systems are categorized as “Industry” if their authors are affiliated with private sector organizations, “Academia” if the authors are affiliated with universities or academic institutions, or “Industry - Academia Collaboration” when at least 30% of the authors are from each. + Possible values: Industry, Research Collective, Academia, Industry - Academia Collaboration (Industry leaning), Industry - Academia Collaboration (Academia leaning), Non-profit + unit: 'AI systems' + short_unit: '' + display: + numDecimalPlaces: 0 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Notable AI systems by researcher affiliation + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_aggregates_affiliation: + variables: + yearly_count: + title: Annual number of AI systems by researcher affiliation + + cumulative_count: + title: Cumulative number of AI systems by researcher affiliation diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.py b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.py new file mode 100644 index 00000000000..7bcbf76a4d8 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.py @@ -0,0 +1,75 @@ +"""Generate aggregated table for total yearly and cumulative number of notable AI systems in each category of researcher affiliation.""" + +import datetime as dt + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_aggregates_affiliation.start") + + # + # Load inputs. + # + # Load the the garden dataset without aggregations. + ds_garden = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_garden["epoch"] + tb = tb.reset_index() + + # + # Process data. + # + # Store the origins metadata for later use + origins = tb["organization_categorization"].metadata.origins + + # Define the columns that are not needed + unused_columns = [ + "days_since_1949", + "parameters", + "training_dataset_size__datapoints", + "domain", + "training_computation_petaflop", + ] + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Ensure 'publication_date' column type is datetime64 + assert tb["publication_date"].dtype == "datetime64[ns]", "publication_date column is not of type datetime64" + + # Extract the year from the 'publication_date' column + tb["year"] = tb["publication_date"].dt.year + + # Group by year and country and count the number of systems + tb_agg = tb.groupby(["year", "organization_categorization"], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count + tb_agg["cumulative_count"] = tb_agg.groupby("organization_categorization", observed=False)["yearly_count"].cumsum() + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = paths.short_name + + # Set the index to year and country + tb_agg = tb_agg.format(["year", "organization_categorization"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs. + # + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + ds_garden.save() + + paths.log.info("epoch_aggregates_affiliation.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain.meta.yml new file mode 100644 index 00000000000..6bfd0b50c0b --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain.meta.yml @@ -0,0 +1,53 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {TODAY}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + description_short: Describes the specific area, application, or field in which an AI system is designed to operate. An AI system can operate in more than one domain, thus contributing to the count for multiple domains. {definitions.desc_update} + description_key: + - Game systems are specifically designed for games and excel in understanding and strategizing gameplay. For instance, AlphaGo, developed by DeepMind, defeated the world champion in the game of Go. Such systems use complex algorithms to compete effectively, even against skilled human players. + + - Language systems are tailored to process language, focusing on understanding, translating, and interacting with human languages. Examples include chatbots, machine translation tools like Google Translate, and sentiment analysis algorithms that can detect emotions in text. 
+ + - Multimodal systems are artificial intelligence frameworks that integrate and interpret more than one type of data input, such as text, images, and audio. ChatGPT-4 is an example of a multimodal system, as it has the capability to process and generate responses based on both textual and visual inputs. + + - Vision systems focus on processing visual information, playing a pivotal role in image recognition and related areas. For example, Facebook's photo tagging system uses vision AI to identify faces. + + - Speech systems are dedicated to handling spoken language, serving as the backbone of voice assistants and similar applications. They recognize, interpret, and generate spoken language to interact with users. + + - Recommendation systems offer suggestions based on user preferences, prominently seen in online shopping and media streaming. For instance, Netflix's movie suggestions or Amazon's product recommendations are powered by algorithms that analyze users' preferences and past behaviors. + + - Audio systems process and generate sound, with applications in music composition, signal processing, and sound recognition. + + - Biology systems analyze biological data and simulate biological processes, aiding in drug discovery and genetic research. + + - Image generation systems create visual content from text descriptions or other inputs, used in graphic design and content creation. + + - Robotics systems combine AI with mechanical engineering to create autonomous robots for various industries. + + - Video systems analyze and generate video content, aiding in editing, surveillance, and content creation. + description_processing: The count of notable AI systems per domain is derived by tallying the instances of machine learning models classified under each domain category. It's important to note that a single machine learning model can fall under multiple domains. The classification into domains is determined by the specific area, application, or field that the AI system is primarily designed to operate within. System domains with less than 10 systems are grouped under "Other." + description_from_producer: A foreign key field categorizing the system’s domain of machine learning. This field links to the [ML Domains table](https://airtable.com/appDFXXgaG1xLtXGL/shrhzolGiQCVnwOY5/tbleYEsZORsiYRVTM), and domains are selected from the options in that table. + unit: 'AI systems' + short_unit: '' + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Notable AI systems by domain type + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_aggregates_domain: + variables: + yearly_count: + title: Annual number of AI systems by domain + + cumulative_count: + title: Cumulative number of AI systems by domain diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain.py b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain.py new file mode 100644 index 00000000000..944913c297f --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_aggregates_domain.py @@ -0,0 +1,99 @@ +"""Generate aggregated table for total yearly and cumulative number of notable AI systems for each domain.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_aggregates_domain.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_meadow["epoch"] + tb = tb.reset_index() + + # + # Process data. + # + + # Store the origins metadata for later use + origins = tb["domain"].metadata.origins + + # Select the rows where the 'notability_criteria' column is not null (only consider notable systems) + tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True) + + # Define the columns that are not needed + unused_columns = [ + "authors", + "country__from_organization", + "organization", + "organization_categorization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + "notability_criteria", + ] + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Convert the 'publication_date' column to datetime format and extract the year + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + tb["year"] = tb["publication_date"].dt.year + + # Split the column to be aggregated by comma (several countries/domains can exist in each cell) + tb["domain"] = tb["domain"].str.split(",") + + # Explode the table to create separate rows for each country or domain + tb_exploded = tb.explode("domain") + + # Drop duplicates where the year, system and country/domain are the same + tb_unique = tb_exploded.drop_duplicates(subset=["year", "system", "domain"]) + + # Replace domains with less than 10 systems with 'Other' + domain_counts = tb_unique["domain"].value_counts() + + tb_unique["domain"] = tb_unique["domain"].where(tb_unique["domain"].map(domain_counts) >= 10, "Other") + # Get the domains that were reclassified to 'Other' + reclassified_domains = domain_counts[domain_counts < 10].index.tolist() + domain_counts = tb_unique["domain"].value_counts() + + paths.log.info( + f"Domains with less than 10 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}" + ) + # Convert the column to category type so that the missing values will be considered as 0 + tb_unique["domain"] = tb_unique["domain"].astype("category") + + # Group by year and country/domain and count the number of systems (consider all categories which will assume 0 for missing values) + tb_agg = tb_unique.groupby(["year", "domain"], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count (consider all categories which will assume 0 for missing values) + tb_agg["cumulative_count"] = tb_agg.groupby("domain", observed=False)["yearly_count"].cumsum() + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = paths.short_name + # Set the index to year and domain + tb_agg = tb_agg.format(["year", "domain"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb_agg]) + + # Save changes in the new garden dataset. 
+ ds_garden.save() + + paths.log.info("epoch_aggregates_domain.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive.meta.yml new file mode 100644 index 00000000000..1c00a1fb21f --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive.meta.yml @@ -0,0 +1,91 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive: + variables: + domain: + title: Domain + unit: '' + short_unit: '' + description_short: Refers to the specific area, application, or field in which an AI system is designed to operate. + display: + zeroDay: '1949-01-01' + yearIsDay: true + + + parameters: + title: Number of parameters + unit: '' + description_short: Total number of learnable variables or weights that the model contains. Parameters are adjusted during the training process to optimize the model's performance. + description_key: + - Parameters are internal variables that machine learning models adjust during their training process to improve their ability to make accurate predictions. They act as the model's "knobs" that are fine-tuned based on the provided data. In deep learning, a subset of artificial intelligence (AI), parameters primarily consist of the weights assigned to the connections between the small processing units called neurons. Picture a vast network of interconnected neurons where the strength of each connection represents a parameter. + + - The total number of parameters in a model is influenced by various factors. The model's structure and the number of “layers” of neurons play a significant role. Generally, more complex models with additional layers tend to have a higher number of parameters. Special components of specific deep learning architectures can further contribute to the overall parameter count. + + - Understanding the number of parameters in a model is crucial to design effective models. More parameters can help the model understand complex data patterns, potentially leading to higher accuracy. However, there's a fine balance to strike. If a model has too many parameters, it risks memorizing the specific examples in its training data rather than learning their underlying patterns. Consequently, it may perform poorly when presented with new, unseen data. Achieving the right balance of parameters is a critical consideration in model development. + + - In recent times, the AI community has witnessed the emergence of what are often referred to as "giant models." These models boast an astounding number of parameters, reaching into the billions or even trillions. While these huge models have achieved remarkable performance, they have a significant computational cost. Effectively managing and training such large-scale models has become a prominent and active area of research and discussion within the AI field. 
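For a concrete sense of where a parameter count comes from, here is a toy calculation that is not an Epoch model: a small fully connected network with a 784-unit input, a 128-unit hidden layer, and a 10-unit output.

```python
# Toy illustration only: parameter count of a small dense network.
layers = [(784, 128), (128, 10)]  # (inputs, outputs) of each dense layer
n_params = sum(n_in * n_out + n_out for n_in, n_out in layers)  # weights + biases
print(n_params)  # 101770 — versus billions for the models tracked in this dataset
```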
+ + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_dataset_size__datapoints: + title: Training dataset size + unit: 'datapoints' + description_short: The number of examples provided to train an AI model. Typically, more data results in a more comprehensive understanding by the model. + description_key: + - Training data size refers to the volume of data employed to train an artificial intelligence (AI) model effectively. It's a representation of the number of examples that the model learns from during its training process. It is a fundamental measure of the scope of the data used in the model's learning phase. + + - To grasp the concept of training data size, imagine teaching a friend the art of distinguishing different types of birds. In this analogy, each bird picture presented to your friend corresponds to an individual piece of training data. If you showed them 100 unique bird photos, then the training data size in this scenario would be quantified as 100. + + - Training data size is an essential indicator in AI and machine learning. First and foremost, it directly impacts the depth of learning achieved by the model. The more extensive the dataset, the more profound and comprehensive the model's understanding of the subject matter becomes. Additionally, a large training data size contributes significantly to improved recognition capabilities. By exposing the model to a diverse array of examples, it becomes adept at identifying subtle nuances, much like how it becomes skilled at distinguishing various bird species through exposure to a large variety of bird images. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_computation_petaflop: + title: Training computation (petaFLOP) + unit: 'petaFLOP' + description_short: Computation is measured in total petaFLOP, which is 10¹⁵ [floating-point operations](#dod:flop) estimated from AI literature, albeit with some uncertainty. + description_key: + - In the context of artificial intelligence (AI), training computation is predominantly measured using floating-point operations or “FLOP”. One FLOP represents a single arithmetic operation involving floating-point numbers, such as addition, subtraction, multiplication, or division. To adapt to the vast computational demands of AI systems, the measurement unit of petaFLOP is commonly used. One petaFLOP stands as a staggering one quadrillion FLOPs, underscoring the magnitude of computational operations within AI. + + - Modern AI systems are rooted in machine learning and deep learning techniques. These methodologies are notorious for their computational intensity, involving complex mathematical processes and algorithms. During the training phase, AI models process large volumes of data, while continuously adapting and refining their parameters to optimize performance, rendering the training process computationally intensive. + + - Many factors influence the magnitude of training computation within AI systems. Notably, the size of the dataset employed for training significantly impacts the computational load. Larger datasets necessitate more processing power. The complexity of the model's architecture also plays a pivotal role; more intricate models lead to more computations. Parallel processing, involving the simultaneous use of multiple processors, also has a substantial effect. Beyond these factors, specific design choices and other variables further contribute to the complexity and scale of training computation within AI. 
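The 10²³ FLOP cut-off quoted in the chart note corresponds to 10⁸ petaFLOP in the units used here. None of the steps in this update apply that filter themselves, so the snapshot presumably arrives already restricted by Epoch; the sketch below is only meant to make the threshold concrete (the column name matches the Epoch export).

```python
# Illustrative only — the compute-intensive snapshot is assumed to be pre-filtered upstream.
LARGE_SCALE_FLOP = 1e23  # threshold quoted in the chart note
is_large_scale = tb["training_compute__flop"] > LARGE_SCALE_FLOP
```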
+ + description_processing: Training computation was converted from its original measurement in FLOPs (floating-point operations) to a more manageable unit known as petaFLOPs. This conversion is performed by dividing the original training compute value by 1e15, which represents one quadrillion (10^15). The purpose of this conversion is to provide a more human-readable and practical representation of the immense computational efforts involved in training AI systems. By expressing the training computation in petaFLOPs, it becomes easier to grasp the scale and magnitude of the computational resources required for training these systems, especially when dealing with large datasets and complex architectures. + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + presentation: + grapher_config: + title: Training computation + + publication_date: + title: Publication date + unit: '' + description_short: The date when the AI system was first published. + description_from_producer: The publication, announcement, or release date of the model, in YYYY-MM-DD format. If the year and month are known but the day is unknown, the day is filled in as YYYY-MM-15. If the year is known but the month and day are unknown, the month and day are filled in as YYYY-07-01. + + + diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive.py b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive.py new file mode 100644 index 00000000000..4eb3048784b --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive.py @@ -0,0 +1,60 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. + # + # Convert FLOP to petaFLOP and remove the column with FLOPs (along with training time in hours) + tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15 + + # Convert publication date to a datetime objects + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + + # Calculate 'days_since_1949' + tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64") + tb = tb.dropna(subset=["days_since_1949"]) + + tb = tb.reset_index(drop=True) + + assert not tb[["system", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values" + + # Drop columns that are not needed + tb = tb.drop( + ["training_compute__flop", "organization", "authors", "country__from_organization"], + axis=1, + ) + tb = tb.format(["days_since_1949", "system"]) + + # Add metadata to the publication date column + tb["publication_date"].metadata.origins = tb["domain"].metadata.origins + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata) + + # Save changes in the new garden dataset. 
+ ds_garden.save() + + paths.log.info("epoch_compute_intensive.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.countries.json b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.countries.json new file mode 100644 index 00000000000..ddfda66807a --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.countries.json @@ -0,0 +1,18 @@ +{ + "Canada": "Canada", + "China": "China", + "Germany": "Germany", + "Israel": "Israel", + "Singapore": "Singapore", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United States of America": "United States", + "Korea (Republic of)": "South Korea", + "Multinational": "Multinational", + "Russia": "Russia", + "Japan": "Japan", + "France": "France", + "Finland": "Finland", + "Total": "Total", + "Hong Kong": "Hong Kong" +} \ No newline at end of file diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.meta.yml new file mode 100644 index 00000000000..f7e374788dc --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.meta.yml @@ -0,0 +1,31 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {TODAY}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). + + unit: 'AI systems' + short_unit: '' + description_short: Refers to the location of the primary organization with which the authors of a large-scale AI systems are affiliated. {definitions.desc_update} + description_processing: The number of large-scale AI systems by country is determined by tallying the number of machine learning models that are associated with the geographical location of the researchers' affiliated institutions. It's important to note that a single model can have multiple authors, each potentially affiliated with different institutions, thus contributing to the count for multiple countries. 
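This counting rule is what the shared calculate_aggregates helper added in this update implements. Below is a condensed sketch of its core on a hypothetical two-row table; it omits the "Total" series, the category handling, and the metadata propagation that the real helper adds.

```python
import pandas as pd

tb = pd.DataFrame(
    {
        "system": ["Model A", "Model B"],
        "year": [2023, 2024],
        "country__from_organization": ["United States of America", "United Kingdom,China"],
    }
)

# One row per (system, country): a system whose authors span several countries
# contributes once to each of those countries.
tb["country__from_organization"] = tb["country__from_organization"].str.split(",")
exploded = tb.explode("country__from_organization").drop_duplicates(
    subset=["year", "system", "country__from_organization"]
)

# Annual and cumulative counts per country.
counts = (
    exploded.groupby(["year", "country__from_organization"]).size().reset_index(name="yearly_count")
)
counts["cumulative_count"] = counts.groupby("country__from_organization")["yearly_count"].cumsum()
```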
+# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ + +dataset: + update_period_days: 31 + title: Large-scale AI systems by country +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive_countries: + variables: + yearly_count: + title: Annual number of large-scale AI systems by country + + cumulative_count: + title: Cumulative number of large-scale AI systems by country diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.py b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.py new file mode 100644 index 00000000000..c17ae31d9e9 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.py @@ -0,0 +1,59 @@ +"""Generate aggregated table for total yearly and cumulative number of compute intensive AI systems in each country.""" + +import shared as sh + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_compute_intensive_countries.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. + # + # Define the columns that are not needed + unused_columns = [ + "domain", + "authors", + "organization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + ] + + # Aggregate the data by country + tb_agg = sh.calculate_aggregates(tb, "country__from_organization", paths.short_name, unused_columns) + + # Rename the 'country__from_organization' column to 'country' + tb_agg = tb_agg.rename(columns={"country__from_organization": "country"}) + + # Harmonize the country names + tb_agg = geo.harmonize_countries(df=tb_agg, countries_file=paths.country_mapping_path) + + # Set the index to year and country + tb_agg = tb_agg.format(["year", "country"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb_agg]) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch_compute_intensive_countries.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.meta.yml new file mode 100644 index 00000000000..c262daad047 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.meta.yml @@ -0,0 +1,48 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {TODAY}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). + description_short: Describes the specific area, application, or field in which a large-scale AI model is designed to operate. 
{definitions.desc_update} + description_key: + - Game systems are specifically designed for games and excel in understanding and strategizing gameplay. For instance, AlphaGo, developed by DeepMind, defeated the world champion in the game of Go. Such systems use complex algorithms to compete effectively, even against skilled human players. + + - Language systems are tailored to process language, focusing on understanding, translating, and interacting with human languages. Examples include chatbots, machine translation tools like Google Translate, and sentiment analysis algorithms that can detect emotions in text. + + - Multimodal systems are artificial intelligence frameworks that integrate and interpret more than one type of data input, such as text, images, and audio. ChatGPT-4 is an example of a multimodal system, as it has the capability to process and generate responses based on both textual and visual inputs. + + - Vision systems focus on processing visual information, playing a pivotal role in image recognition and related areas. For example, Facebook's photo tagging system uses vision AI to identify faces. + + - Speech systems are dedicated to handling spoken language, serving as the backbone of voice assistants and similar applications. They recognize, interpret, and generate spoken language to interact with users. + + - Biology systems analyze biological data and simulate biological processes, aiding in drug discovery and genetic research. + + - Image generation systems create visual content from text descriptions or other inputs, used in graphic design and content creation. + + description_processing: The count of large-scale AI models AI systems per domain is derived by tallying the instances of machine learning models classified under each domain category. It's important to note that a single machine learning model can fall under multiple domains. The classification into domains is determined by the specific area, application, or field that the AI system is primarily designed to operate within. + description_from_producer: A foreign key field categorizing the system’s domain of machine learning. This field links to the [ML Domains table](https://airtable.com/appDFXXgaG1xLtXGL/shrhzolGiQCVnwOY5/tbleYEsZORsiYRVTM), and domains are selected from the options in that table. + unit: 'AI systems' + short_unit: '' + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Large-scale AI systems by domain type + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive_domain: + variables: + yearly_count: + title: Annual number of large-scale AI models by domain + + cumulative_count: + title: Cumulative number of large-scale AI models by domain diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.py b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.py new file mode 100644 index 00000000000..01a8cfe0980 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.py @@ -0,0 +1,52 @@ +"""Generate aggregated table for total yearly and cumulative number of compute intensive AI systems for each domain.""" + +import shared as sh + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_compute_intensive_domain.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. + # + # Define the columns that are not needed + unused_columns = [ + "authors", + "country__from_organization", + "organization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + ] + + # Aggregate the data by domain + tb_agg = sh.calculate_aggregates(tb, "domain", paths.short_name, unused_columns) + + # Set the index to year and domain + tb_agg = tb_agg.format(["year", "domain"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb_agg]) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch_compute_intensive_domain.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-09-09/shared.py b/etl/steps/data/garden/artificial_intelligence/2024-09-09/shared.py new file mode 100644 index 00000000000..f9ac6876d20 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-09-09/shared.py @@ -0,0 +1,74 @@ +from typing import List + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def calculate_aggregates(tb: Table, agg_column: str, short_name: str, unused_columns: List[str]) -> Table: + """ + This function calculates aggregates for a given column in a Table. It is used to calculate the total yearly and cumulative number of notable AI systems for each domain or country. + + Parameters: + tb (Table): The input Table. + agg_column (str): The column to aggregate on. + short_name (str): The short name to set for the table. + unused_columns (List[str]): The list of columns to drop from the table. + + Returns: + Table: The output Table with calculated aggregates. 
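+
+    Example:
+        An illustrative sketch only. The "domain" value and the unused-column names are
+        assumptions based on the meadow tables used elsewhere in this update; the input
+        table must also contain "publication_date" and "system" columns, which this
+        function relies on.
+
+            unused = ["authors", "organization", "parameters"]
+            tb_agg = calculate_aggregates(tb, "domain", "epoch_aggregates_domain", unused)
+            # tb_agg now has one row per (year, domain) plus "Total" rows,
+            # with "yearly_count" and "cumulative_count" columns.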
+ """ + + # Store the origins metadata for later use + origins = tb[agg_column].metadata.origins + + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Convert the 'publication_date' column to datetime format and extract the year + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + tb["year"] = tb["publication_date"].dt.year + + # Convert the column to category type so that the missing values will be considered as 0 + tb[agg_column] = tb[agg_column].astype("category") + + # Group total yearly counts and calculate cumulative count for total number of systems + tb_total = tb.groupby(["year"]).size().reset_index(name="yearly_count") + total_counts = tb_total.groupby("year")["yearly_count"].sum().reset_index() + total_counts[agg_column] = "Total" + total_counts["cumulative_count"] = total_counts["yearly_count"].cumsum() + + # Split the column to be aggregated by comma (several countries/domains can exist in each cell) + tb[agg_column] = tb[agg_column].str.split(",") + + # Explode the table to create separate rows for each country or domain + tb_exploded = tb.explode(agg_column) + + # Convert the column to category type so that the missing values will be considered as 0 + tb_exploded[agg_column] = tb_exploded[agg_column].astype("category") + + # Drop duplicates where the year, system and country/domain are the same + tb_unique = tb_exploded.drop_duplicates(subset=["year", "system", agg_column]) + + # Group by year and country/domain and count the number of systems (consider all categories which will assume 0 for missing values) + tb_agg = tb_unique.groupby(["year", agg_column], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count (consider all categories which will assume 0 for missing values) + tb_agg["cumulative_count"] = tb_agg.groupby(agg_column, observed=False)["yearly_count"].cumsum() + + # Combine aggregated data with total counts + tb_agg = pr.concat([tb_agg, total_counts], ignore_index=True) + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = short_name + + return tb_agg diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.meta.yml b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.meta.yml new file mode 100644 index 00000000000..af50f790b40 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.meta.yml @@ -0,0 +1,18 @@ +definitions: + common: + unit: '' + short_unit: '' + display: + zeroDay: '1949-01-01' + yearIsDay: true + +tables: + epoch: + variables: + max_compute: + title: Maximum compute + max_data: + title: Maximum data + max_parameters: + title: Maximum parameters + diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py new file mode 100644 index 00000000000..df2c6cc82ad --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py @@ -0,0 +1,89 @@ +"""Load a garden dataset and create a grapher dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. 
+ ds_garden = paths.load_dataset("epoch") + + # Read table from garden dataset. + tb = ds_garden["epoch"].reset_index() + # + # Process data. + # + # Extract year from 'publication_date' and create a new 'year' column + tb["year"] = tb["publication_date"].dt.year + + # For visualization purposes I am adding the rows with the maximum values of compute, data, and parameters in each year to the table as a separate "system". I don't want to do this in garden as it'd affect other datasets that depend on this one. + columns = { + "training_computation_petaflop": "compute", + "training_dataset_size__datapoints": "data", + "parameters": "parameters", + } + # Find maximum values for a given column (compute, data, params) per year, label them, and add summary rows. + for column, label in columns.items(): + tb = find_max_label_and_concat(tb, column, label) + + # Update metadata + for col in ["max_compute", "max_parameters", "max_data"]: + tb[col].metadata.origins = tb["system"].metadata.origins + + # Drop year as we don't need it anymore + tb = tb.drop("year", axis=1) + + # Rename for plotting model name as country in grapher + tb = tb.rename(columns={"system": "country", "days_since_1949": "year"}) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() + + +def find_max_label_and_concat(tb, column, label): + """ + Find maximum values for a given column per year, label them, and add summary rows. + + This function: + 1. Identifies rows with maximum values for the specified column in each year. + 2. Labels these maximum value rows in a new column using their original system names. + 3. Creates new summary rows for these maximum values. + 4. Adds these new summary rows to the original table. + + Note: + - Creates a new column named f"max_{label}" to indicate maximum values. + - Preserves original data and system names. + - Adds new summary rows with "system" set to f"Maximum {label}". + """ + idx = tb[[column, "year"]].fillna(0).groupby("year")[column].idxmax() + + tb[f"max_{label}"] = "Other" + tb.loc[idx, f"max_{label}"] = f"Maximum {label}" + + max_rows = tb.loc[idx].copy() + max_rows["system"] = f"Maximum {label}" + + tb = pr.concat([tb, max_rows], ignore_index=True) + + return tb diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.py new file mode 100644 index 00000000000..6582a86db80 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_affiliation.py @@ -0,0 +1,41 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_affiliation") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_affiliation"] + + # + # Process data. 
+ # + # Rename for plotting research affiliation as country in grapher + tb = tb.rename_index_names( + { + "organization_categorization": "country", + } + ) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_countries.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_countries.py new file mode 100644 index 00000000000..658d7982804 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_countries.py @@ -0,0 +1,30 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_countries") + + # Read table from garden dataset. + tb_garden = ds_garden["epoch_aggregates_countries"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain.py new file mode 100644 index 00000000000..fb2fa66d43b --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_domain.py @@ -0,0 +1,39 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_domain") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_domain"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "domain": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_organizations.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_organizations.py new file mode 100644 index 00000000000..f479f165881 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_aggregates_organizations.py @@ -0,0 +1,38 @@ +"""Load a garden dataset and create a grapher dataset.""" +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. 
+ # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_organizations") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_organizations"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "organization": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive.py new file mode 100644 index 00000000000..323e67bd023 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive.py @@ -0,0 +1,33 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive"] + + # + # Process data. + # + # Rename for plotting model name as country in grapher + tb = tb.rename_index_names({"system": "country", "days_since_1949": "year"}) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.py new file mode 100644 index 00000000000..ef0aea55b10 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_countries.py @@ -0,0 +1,30 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_countries") + + # Read table from garden dataset. + tb_garden = ds_garden["epoch_compute_intensive_countries"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.py new file mode 100644 index 00000000000..efb5fea33ce --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_domain.py @@ -0,0 +1,39 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_domain") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive_domain"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "domain": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_organizations.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_organizations.py new file mode 100644 index 00000000000..9478c5e5e42 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch_compute_intensive_organizations.py @@ -0,0 +1,38 @@ +"""Load a garden dataset and create a grapher dataset.""" +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_organizations") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive_organizations"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "organization": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch.py new file mode 100644 index 00000000000..0a18c82226a --- /dev/null +++ b/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch.py @@ -0,0 +1,73 @@ +"""Load a snapshot and create a meadow dataset.""" + +import numpy as np + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("epoch.csv") + + # Read snapshot + tb = snap.read() + + # + # Process data. + # + # Define columns of interest. 
+    cols = [
+        "System",
+        "Domain",
+        "Authors",
+        "Country (from Organization)",
+        "Organization",
+        "Organization categorization",
+        "Publication date",
+        "Parameters",
+        "Training compute (FLOP)",
+        "Training dataset size (datapoints)",
+        "Notability criteria",
+    ]
+
+    # Check that the columns of interest are present
+    for col in cols:
+        assert col in tb.columns, f"Column '{col}' is missing from the dataframe."
+
+    # Select the columns of interest
+    tb = tb[cols]
+    # Replace empty strings with NaN values
+    tb = tb.replace("", np.nan)
+    # Remove rows where all values are NaN
+    tb = tb.dropna(how="all")
+
+    # Convert the training compute column to float
+    tb["Training compute (FLOP)"] = tb["Training compute (FLOP)"].astype(float)
+
+    # Fill missing values in the system column with the organization column; if the organization is also missing, fall back to the authors column
+    tb["System"] = tb["System"].fillna(tb["Organization"]).fillna(tb["Authors"])
+    # Check that there are no NaN values in the system column
+    assert not tb["System"].isna().any(), "NaN values found in 'System' column after processing."
+    #
+    # Create a new table.
+    #
+    tb = tb.format(["system", "publication_date"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
+
+    paths.log.info("epoch.end")
diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive.py b/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive.py
new file mode 100644
index 00000000000..bfb27c7bb46
--- /dev/null
+++ b/etl/steps/data/meadow/artificial_intelligence/2024-09-09/epoch_compute_intensive.py
@@ -0,0 +1,66 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import numpy as np
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("epoch_compute_intensive.csv")
+
+    # Load data from snapshot.
+    tb = snap.read()
+
+    #
+    # Process data.
+    #
+    # Define columns of interest.
+    cols = [
+        "System",
+        "Domain",
+        "Authors",
+        "Country (from Organization)",
+        "Organization",
+        "Publication date",
+        "Parameters",
+        "Training compute (FLOP)",
+        "Training dataset size (datapoints)",
+    ]
+
+    # Check that the columns of interest are present
+    for col in cols:
+        assert col in tb.columns, f"Column '{col}' is missing from the dataframe."
+
+    # Select the columns of interest
+    tb = tb[cols]
+    # Replace empty strings with NaN values
+    tb = tb.replace("", np.nan)
+    # Remove rows where all values are NaN
+    tb = tb.dropna(how="all")
+
+    # Convert the training compute column to float
+    tb["Training compute (FLOP)"] = tb["Training compute (FLOP)"].astype(float)
+
+    # Fill missing values in the system column with the organization column; if the organization is also missing, fall back to the authors column
+    tb["System"] = tb["System"].fillna(tb["Organization"]).fillna(tb["Authors"])
+    # Check that there are no NaN values in the system column
+    assert not tb["System"].isna().any(), "NaN values found in 'System' column after processing."
+    #
+    # Create a new table.
+ # + tb = tb.format(["system", "publication_date"]) + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc b/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc new file mode 100644 index 00000000000..5689cf49d67 --- /dev/null +++ b/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc @@ -0,0 +1,37 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: Parameter, Compute and Data Trends in Machine Learning + date_published: 2024-07-19 + description_snapshot: | + We update this chart with the latest available data from our source every month. + + The authors selected the AI systems for inclusion based on the following necessary criteria: + — Have an explicit learning component + — Showcase experimental results + — Advance the state of the art + + In addition, the systems had to meet at least one of the following notability criteria: + — Paper has more than 1000 citations + — Historical importance + — Important state-of-the-art advance + — Deployed in a notable context + + The authors note that: "For new models (from 2020 onward) it is harder to assess these criteria, so we fall back to a subjective selection. We refer to models meeting our selection criteria as 'milestone models." + # Citation + producer: Epoch + citation_full: "Epoch AI, ‘Parameter, Compute and Data Trends in Machine Learning’. Published online at epochai.org. Retrieved from: ‘https://epochai.org/data/epochdb/visualization’ [online resource]" + # Files + url_main: https://epochai.org/mlinputs/visualization + url_download: https://epochai.org/data/epochdb/notable_ai_models.csv + date_accessed: 2024-09-09 + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ +outs: + - md5: 4ec3b15bb7ff1a09d20bfc0ac4dbfbe8 + size: 1380677 + path: epoch.csv diff --git a/snapshots/artificial_intelligence/2024-09-09/epoch.py b/snapshots/artificial_intelligence/2024-09-09/epoch.py new file mode 100644 index 00000000000..daa355e267f --- /dev/null +++ b/snapshots/artificial_intelligence/2024-09-09/epoch.py @@ -0,0 +1,33 @@ +"""Script to create a snapshot of dataset 'Parameter, Compute and Data Trends in Machine Learning (Epoch, 2023)'.""" + + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/epoch.csv") + + # Download data from source. + snap.download_from_source() + + # Add file to DVC and upload to S3. 
+ snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/artificial_intelligence/2024-09-09/epoch_compute_intensive.csv.dvc b/snapshots/artificial_intelligence/2024-09-09/epoch_compute_intensive.csv.dvc new file mode 100644 index 00000000000..1e130b6229e --- /dev/null +++ b/snapshots/artificial_intelligence/2024-09-09/epoch_compute_intensive.csv.dvc @@ -0,0 +1,32 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Tracking Compute-Intensive AI Models + description: |- + A dataset that tracks compute-intensive AI models, with training compute over 10²³ floating point operations (FLOP). This corresponds to training costs of hundreds of thousands of dollars or more.  + + To identify compute-intensive AI models, the team at Epoch AI used various resources, estimating compute when not directly reported. They included benchmarks and repositories, such as Papers With Code and Hugging Face, to find models exceeding 10²³ FLOP. They also explored non-English media and specific leaderboards, particularly focusing on Chinese sources. + + Additionally, they examined blog posts, press releases from major labs, and scholarly literature to track new models. A separate table was created for models with unconfirmed but plausible compute levels. Despite thorough methods, proprietary and secretive models may have been missed. + date_published: "2024-06-19" + + # Citation + producer: Epoch + citation_full: |- + Robi Rahman, David Owen and Josh You (2024), "Tracking Compute-Intensive AI Models". Published online at epochai.org. Retrieved from: 'https://epochai.org/blog/tracking-compute-intensive-ai-models' [online resource] + + # Files + url_main: https://epochai.org/blog/tracking-compute-intensive-ai-models + url_download: https://epochai.org/data/epochdb/large_scale_ai_models.csv + date_accessed: 2024-08-05 + + # License + license: + name: CC BY 4.0 + url: https://epochai.org/blog/how-much-does-it-cost-to-train-frontier-ai-models +outs: + - md5: 171b9b8a1b68f1ccca2c1a510761007a + size: 427089 + path: epoch_compute_intensive.csv diff --git a/snapshots/artificial_intelligence/2024-09-09/epoch_compute_intensive.py b/snapshots/artificial_intelligence/2024-09-09/epoch_compute_intensive.py new file mode 100644 index 00000000000..fdbd7822e4a --- /dev/null +++ b/snapshots/artificial_intelligence/2024-09-09/epoch_compute_intensive.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/epoch_compute_intensive.csv") + + # Download data from source, add file to DVC and upload to S3. 
+ snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() From 8e4b1b40c5dee1f08170200c83e4fb0aaa73317c Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:27:56 +0200 Subject: [PATCH 2/3] keep maximum only if higher than previous year --- .../2024-09-09/epoch.py | 21 +++++++++++++++---- .../2024-09-09/epoch.csv.dvc | 4 ++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py index df2c6cc82ad..9c26bbefcf7 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py @@ -1,6 +1,7 @@ """Load a garden dataset and create a grapher dataset.""" import owid.catalog.processing as pr +from owid.catalog import Table from etl.helpers import PathFinder, create_dataset, grapher_checks @@ -76,13 +77,25 @@ def find_max_label_and_concat(tb, column, label): - Preserves original data and system names. - Adds new summary rows with "system" set to f"Maximum {label}". """ - idx = tb[[column, "year"]].fillna(0).groupby("year")[column].idxmax() + tb = tb.sort_values(by=["year"]) # Ensure the DataFrame is sorted by year + max_value = -float("inf") + rows_to_keep = [] - tb[f"max_{label}"] = "Other" - tb.loc[idx, f"max_{label}"] = f"Maximum {label}" + for _, row in tb.iterrows(): + if row[column] > max_value: + max_value = row[column] + rows_to_keep.append(row) - max_rows = tb.loc[idx].copy() + tb_filtered = Table(rows_to_keep) + + idx = tb_filtered[[column, "year"]].fillna(0).groupby("year")[column].idxmax() + + tb_filtered[f"max_{label}"] = "Other" + tb_filtered.loc[idx, f"max_{label}"] = f"Maximum {label}" + + max_rows = tb_filtered.loc[idx].copy() max_rows["system"] = f"Maximum {label}" + print(max_rows) tb = pr.concat([tb, max_rows], ignore_index=True) diff --git a/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc b/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc index 5689cf49d67..2aac9cfe012 100644 --- a/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc +++ b/snapshots/artificial_intelligence/2024-09-09/epoch.csv.dvc @@ -32,6 +32,6 @@ meta: name: CC BY 4.0 url: https://creativecommons.org/licenses/by/4.0/ outs: - - md5: 4ec3b15bb7ff1a09d20bfc0ac4dbfbe8 - size: 1380677 + - md5: faffb7413e77dbdab030d30ce12d59bd + size: 1390151 path: epoch.csv From f78755c0e5e8476b680bdb7fecde227eeb4b0e44 Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:28:09 +0200 Subject: [PATCH 3/3] Update epoch.py --- .../data/grapher/artificial_intelligence/2024-09-09/epoch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py index 9c26bbefcf7..435e1f6c9d2 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py +++ b/etl/steps/data/grapher/artificial_intelligence/2024-09-09/epoch.py @@ -95,7 +95,6 @@ def find_max_label_and_concat(tb, column, label): max_rows = tb_filtered.loc[idx].copy() max_rows["system"] = f"Maximum {label}" - print(max_rows) tb = pr.concat([tb, max_rows], ignore_index=True)
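
A note on the revised `find_max_label_and_concat` in patch 2/3: the row-by-row loop keeps a summary row only for years whose best value beats every earlier year, which matches the commit message. One side effect worth confirming is that `max_{label}` is now assigned only on the filtered rows, so ordinary systems carry missing values where the previous version labelled them "Other". For reference, the same record-keeping logic can be written without `iterrows`. The sketch below is untested and not part of the submitted patch; the function name is invented, it assumes the grapher table has "year", "system" and a numeric value column (as in this step), and it restores the "Other" default.

import numpy as np
import owid.catalog.processing as pr
from owid.catalog import Table


def find_running_max_and_concat(tb: Table, column: str, label: str) -> Table:
    """Vectorised sketch: append a 'Maximum {label}' row for every year that sets a new record."""
    tb = tb.sort_values("year").copy()
    tb[f"max_{label}"] = "Other"  # default label for non-record rows, as in the pre-patch version

    # Best value per year, and the best value seen in any earlier year.
    yearly_best = tb.groupby("year")[column].max()
    prev_best = yearly_best.fillna(-np.inf).cummax().shift().fillna(-np.inf)

    # A year counts as a record year only if it strictly exceeds everything before it.
    record_years = yearly_best.index[(yearly_best > prev_best).fillna(False)]

    # Row index of the record-setting system within each record year.
    idx = tb[tb["year"].isin(record_years)].groupby("year")[column].idxmax()

    tb.loc[idx, f"max_{label}"] = f"Maximum {label}"
    max_rows = tb.loc[idx].copy()
    max_rows["system"] = f"Maximum {label}"

    return pr.concat([tb, max_rows], ignore_index=True)

For record rows this should produce the same output as the loop version on this update's data; the only intended difference is the restored "Other" labels on the remaining rows.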