diff --git a/dag/artificial_intelligence.yml b/dag/artificial_intelligence.yml
index 3a8cb2a4fa5..33485d7652a 100644
--- a/dag/artificial_intelligence.yml
+++ b/dag/artificial_intelligence.yml
@@ -206,10 +206,3 @@ steps:
   data://grapher/artificial_intelligence/2023-07-07/semiconductors_cset:
     - data://garden/artificial_intelligence/2023-07-07/semiconductors_cset
-# CSET data on patents, articles and private investment
-  data://meadow/artificial_intelligence/2023-07-23/cset:
-    - snapshot://artificial_intelligence/2023-07-23/cset.csv
-  data://garden/artificial_intelligence/2023-07-23/cset:
-    - data://meadow/artificial_intelligence/2023-07-23/cset
-  data://grapher/artificial_intelligence/2023-07-23/cset:
-    - data://garden/artificial_intelligence/2023-07-23/cset
diff --git a/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc b/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc
deleted file mode 100644
index a5945d8209e..00000000000
--- a/snapshots/artificial_intelligence/2023-07-23/cset.csv.dvc
+++ /dev/null
@@ -1,28 +0,0 @@
-meta:
-  name: 'Country Activity Tracker: Artificial Intelligence (Center for Security and
-    Emerging Technology, 2023)'
-  publication_year: 2023
-
-  publication_date: '2023-07-21'
-  source_name: Center for Security and Emerging Technology (2023)
-  source_published_by: Emerging Technology Observatory Country Activity Tracker, Artificial
-    Intelligence (Center for Security and Emerging Technology, 2023)
-  url: https://cat.eto.tech/
-  source_data_url:
-  license_url: https://eto.tech/tou/
-  license_name: Creative Commons BY 4.0
-  date_accessed: 2023-07-23
-  is_public: true
-  description: |
-    The research data in CAT (Country Attributes and Topics) is derived from ETO's Merged Academic Corpus (MAC), which contains detailed information on over 270 million scholarly articles worldwide. CAT uses only AI-related articles from the MAC. Articles are attributed to countries based on the author organizations listed in each article's metadata. An article is attributed to a country if it lists at least one author affiliated with an organization in that country.
-
-    The top ten authors for each country are identified based on the number of citations to articles they released while affiliated with institutions in that country. CAT classifies articles into AI subfields using subject assignment scores in the MAC. Articles are assigned to up to three subfields based on their scores.
-
-    CAT includes patent data from 1790 Analytics and Dimensions, and it counts AI-related patent families, including patent applications and granted patents. Patents are attributed to the country where they are filed, not necessarily the inventor's nationality. CAT also uses Crunchbase data to identify AI-related companies based on various criteria and includes investment metrics for these companies.
-
-    The data in CAT is updated at least once a quarter, with plans for more frequent updates in the future.
-wdir: ../../../data/snapshots/artificial_intelligence/2023-07-23
-outs:
-- md5: afe33f3d16404cf7527a43c5189847e8
-  size: 1658442
-  path: cset.csv
diff --git a/snapshots/artificial_intelligence/2023-07-23/cset.py b/snapshots/artificial_intelligence/2023-07-23/cset.py
deleted file mode 100644
index 84f6ac8103c..00000000000
--- a/snapshots/artificial_intelligence/2023-07-23/cset.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""Script to create a snapshot of dataset 'Country Activity Tracker: Artificial Intelligence (Center for Security and Emerging Technology, 2023)'."""
-
-from pathlib import Path
-
-import click
-import pandas as pd
-from owid.datautils.io import df_to_file
-
-from etl.snapshot import Snapshot
-
-# Version for current snapshot dataset.
-SNAPSHOT_VERSION = Path(__file__).parent.name
-
-
-@click.command()
-@click.option(
-    "--upload/--skip-upload",
-    default=True,
-    type=bool,
-    help="Upload dataset to Snapshot",
-)
-def main(upload: bool) -> None:
-    # Create a new snapshot.
-    snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/cset.csv")
-    common_path = "/Users/veronikasamborska/Downloads/owid_cat_data_20230717/"
-
-    files = {
-        "companies": ["companies_yearly_disclosed.csv", "companies_yearly_estimated.csv"],
-        "patents": ["patents_yearly_applications.csv", "patents_yearly_granted.csv"],
-        "articles": ["publications_yearly_articles.csv", "publications_yearly_citations.csv"],
-    }
-
-    all_dfs = []
-    for field, file_ids in files.items():
-        all_dfs.append(read_and_clean_data(file_ids, common_path, field))
-
-    result = pd.concat(all_dfs)
-    df_to_file(result, file_path=snap.path)
-    # Add file to DVC and upload to S3.
-    snap.dvc_add(upload=upload)
-
-
-def read_and_clean_data(file_ids, common_path, field_name):
-    all_dfs_list = []
-    for id in file_ids:
-        df_add = pd.read_csv(common_path + id)
-        if "estimated" in id:
-            df_add.rename(columns={"disclosed_investment": "disclosed_investment_estimated"}, inplace=True)
-        all_dfs_list.append(df_add)
-
-    merged_df = all_dfs_list[0]
-    for df in all_dfs_list[1:]:
-        merged_df = pd.merge(merged_df, df, on=["year", "country", "field"])
-
-    merged_df.rename(columns={"field": field_name}, inplace=True)
-    return merged_df
-
-
-if __name__ == "__main__":
-    main()