diff --git a/etl/steps/data/garden/artificial_intelligence/latest/epoch.py b/etl/steps/data/garden/artificial_intelligence/latest/epoch.py index 764785f5ced..e5af4577835 100644 --- a/etl/steps/data/garden/artificial_intelligence/latest/epoch.py +++ b/etl/steps/data/garden/artificial_intelligence/latest/epoch.py @@ -1,15 +1,17 @@ """Load a meadow dataset and create a garden dataset.""" - import pandas as pd +from structlog import get_logger from etl.helpers import PathFinder, create_dataset +log = get_logger() + # Get paths and naming conventions for current step. paths = PathFinder(__file__) def run(dest_dir: str) -> None: - paths.log.info("epoch.start") + log.info("epoch.start") # # Load inputs. @@ -23,111 +25,64 @@ def run(dest_dir: str) -> None: # # Process data. - # # Filter notable systems by selecting rows where 'notability_criteria' is not nan tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True) tb = tb.drop("notability_criteria", axis=1) # Convert relevant columns to string type - columns = ["system", "domain", "organization_categorization"] - tb[columns] = tb[columns].astype(str) - - def simplify_entry(entry): - """ - Simplifies an entry of organization categories which can include many entries of Industry, Academia etc. - Removes duplicates, ensures all words except the first one start with a lower case letter,and joins the categories with ", " and " and " before the last one. - """ - # Check for "nan" - if entry == "nan": - return "Not specified" - - # Split the entry into categories, convert to set to remove duplicates - categories = sorted(set(entry.split(","))) - - # Make sure all words except the first one start with a lower case letter - categories = [categories[0]] + [category.lower() for category in categories[1:]] - - # Join the categories with ", " and " and " before the last one - if len(categories) > 1: - simplified_entry = ", ".join(categories[:-1]) + " and " + categories[-1] + " collaboration" + for column in ["system", "domain", "organization_categorization", "approach"]: + tb[column] = tb[column].astype(str) + + # Function to categorize entries + def categorize(entry): + entries = entry.split(",") + if "Academia" in entry and "Industry" in entry: + return "Academia and industry collaboration" + elif all(e == "Academia" for e in entries): + return "Academia" + elif all(e == "Industry" for e in entries): + return "Industry" else: - simplified_entry = categories[0] - - return simplified_entry - - tb["organization_categorization"] = tb["organization_categorization"].apply(simplify_entry) - - # Get the unique values in the organization_categorization column and compare them to expected affiliations - unique_values = set(tb["organization_categorization"]) - expected_values = { - "Industry", - "Academia", - "Government", - "Academia and industry collaboration", - "Academia and research collective collaboration", - "Industry and research collective collaboration", - "Academia, industry and research collective collaboration", - "Government and industry collaboration", - "Research collective", - "Academia, government and industry collaboration", - "Academia and government collaboration", - "Academia, government, industry and research collective collaboration", - "Not specified", - } - assert unique_values == expected_values, "Unexpected affiliations in organization_categorization column" - - # Replace affiliation of researchers with less than 20 systems with 'Other' - affiliation_counts = tb["organization_categorization"].value_counts() - - tb["organization_categorization"] = tb["organization_categorization"].where( - tb["organization_categorization"].map(affiliation_counts) >= 20, "Other" - ) - # Get the organizations that were reclassified to 'Other' - reclassified_organizations = affiliation_counts[affiliation_counts < 20].index.tolist() - - paths.log.info( - f"Affiliations of researchers with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_organizations)}" - ) - - # Replace nans with Unspecified in each column to avoid issues when calculating sume of notable systems - columns = ["organization_categorization", "domain", "organization"] - tb[columns] = tb[columns].replace("nan", "Not specified") - - # Check for multiple entries in 'domain' separated by comma - multiple_domains = tb["domain"].str.contains(",") - # Replace entries in 'domain' that contain a comma with 'Multiple Domains' - tb.loc[multiple_domains, "domain"] = "Multiple domains" - - # Replace domains with less than 20 systems with 'Other' - domain_counts = tb["domain"].value_counts() + return "Other" - tb["domain"] = tb["domain"].where(tb["domain"].map(domain_counts) >= 20, "Other") - # Get the domains that were reclassified to 'Other' - reclassified_domains = domain_counts[domain_counts < 20].index.tolist() + # Apply categorize function to the column + tb["organization_categorization"] = tb["organization_categorization"].apply(categorize) + tb["organization_categorization"] = tb["organization_categorization"].astype(str) + + # Clean up system names + tb["system"] = tb["system"].replace({"Univeristy": "University", "Nvidia": "NVIDIA"}, regex=True) + + # There is a typo in the domain name "VIsion" + tb["domain"] = tb["domain"].replace({"VIsion": "Vision"}) + + # Ensure 'organization_categorization' and 'domain' + for column in ["organization_categorization", "domain", "organization", "approach"]: + tb[column] = tb[column].replace("nan", "Not specified") + # Find domains with total numbrt of notable systems below 10 + domain_counts = tb["domain"].value_counts() + domains_below_10 = domain_counts[domain_counts < 20].index.tolist() + log.info(f"Domains with less than 20 notable systems that were reclassified to Other: {domains_below_10}") + # Rename the domains with less than 10 notable systems to 'Other' + tb["domain"] = tb["domain"].apply(lambda x: "Other" if x in domains_below_10 else x) - paths.log.info( - f"Domains with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}" - ) # Convert FLOP to petaFLOP and remove the column with FLOPs (along with training time in hours) tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15 - # Convert publication date to a datetime objects + # Convert publication date to a datetime objects tb["publication_date"] = pd.to_datetime(tb["publication_date"]) # Calculate 'days_since_1949' - tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64") + tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days tb = tb.dropna(subset=["days_since_1949"]) tb = tb.reset_index(drop=True) + tb["days_since_1949"] = tb["days_since_1949"].astype(int) assert not tb[["system", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values" # Drop columns that are not needed - tb = tb.drop( - ["training_compute__flop", "organization", "authors", "country__from_organization"], - axis=1, - ) - tb = tb.format(["days_since_1949", "system"]) + tb = tb.drop(["training_compute__flop", "training_time__hours", "organization"], axis=1) + tb = tb.set_index(["days_since_1949", "system"], verify_integrity=True).sort_index() # Add metadata to the publication date column tb["publication_date"].metadata.origins = tb["domain"].metadata.origins @@ -141,4 +96,4 @@ def simplify_entry(entry): # Save changes in the new garden dataset. ds_garden.save() - paths.log.info("epoch.end") + log.info("epoch.end")