From 09302ffbe766070d64f1b3865e712eaae6824f99 Mon Sep 17 00:00:00 2001
From: Noah Cavestany
Date: Wed, 15 May 2024 18:42:36 -0700
Subject: [PATCH] added cleaning scripts

---
 scripts/addYearRank.py  | 42 ++++++++++++++++++++++++++++++++++++++++++
 scripts/cleanHeaders.py | 40 ++++++++++++++++++++++++++++++++++++++++
 scripts/region.py       | 31 +++++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+)
 create mode 100644 scripts/addYearRank.py
 create mode 100644 scripts/cleanHeaders.py
 create mode 100644 scripts/region.py

diff --git a/scripts/addYearRank.py b/scripts/addYearRank.py
new file mode 100644
index 0000000..9691d7f
--- /dev/null
+++ b/scripts/addYearRank.py
@@ -0,0 +1,42 @@
+"""Add a "Year" column, and a rank column if missing, to a CSV file."""
+import os
+
+import pandas as pd
+
+# Any of these column names counts as an existing rank column.
+RANK_HEADERS = ('RANK', 'Happiness.Rank', 'Overall rank', 'Happiness Rank')
+
+
+def add_headers(csv_file):
+    """Write "<year>cleaned.csv" with "Year" and rank columns added.
+
+    The year is the base name of csv_file up to its first "." (so
+    "2022.csv" gives "2022"). If no RANK_HEADERS column is present, a
+    "Happiness Rank" column numbering the rows 1..n in file order is
+    added. The output goes to the current working directory.
+
+    Prints a message and returns without writing if csv_file is missing.
+    """
+    if not os.path.isfile(csv_file):
+        print("File does not exist.")
+        return
+
+    df = pd.read_csv(csv_file)
+
+    # Detect an existing rank column under any of its known names.
+    rank_header_exists = any(name in df.columns for name in RANK_HEADERS)
+
+    year = os.path.basename(csv_file).split('.')[0]
+    df['Year'] = year
+
+    if not rank_header_exists:
+        df['Happiness Rank'] = df.index + 1
+
+    output_file = f"{year}cleaned.csv"
+    df.to_csv(output_file, index=False)
+    print(f"Headers added successfully. Saved as {output_file}")
+
+
+if __name__ == "__main__":
+    # Example usage: put the path to your CSV file here.
+    add_headers("2022.csv")
diff --git a/scripts/cleanHeaders.py b/scripts/cleanHeaders.py
new file mode 100644
index 0000000..f7d5365
--- /dev/null
+++ b/scripts/cleanHeaders.py
@@ -0,0 +1,40 @@
+"""Normalise country names and happiness scores in a folder of CSVs."""
+import os
+
+import pandas as pd
+
+
+def process_csv_files(folder_path):
+    """Rewrite every ".csv" file in folder_path in place.
+
+    In each file, the "Country" value "United States" becomes
+    "United States of America", and each "Happiness Score" has its
+    periods removed and is left-padded with zeroes to 4 characters.
+
+    Raises KeyError if a file lacks either column.
+    """
+    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
+
+    for file in csv_files:
+        file_path = os.path.join(folder_path, file)
+
+        # Read the score as text: a numeric column has no .str accessor,
+        # and parsing "7.50" as a float would lose the trailing zero.
+        df = pd.read_csv(file_path, dtype={'Happiness Score': str})
+
+        df.loc[df['Country'] == 'United States', 'Country'] = (
+            'United States of America'
+        )
+
+        # regex=False: "." must match a literal period, not any character.
+        score = df['Happiness Score'].str.replace('.', '', regex=False)
+        # Series.str.zfill leaves missing values as NaN instead of raising.
+        df['Happiness Score'] = score.str.zfill(4)
+
+        df.to_csv(file_path, index=False)
+        print(f"Processed file: {file}")
+
+
+if __name__ == "__main__":
+    # Example usage.
+    process_csv_files('/test')
diff --git a/scripts/region.py b/scripts/region.py
new file mode 100644
index 0000000..d3ec26d
--- /dev/null
+++ b/scripts/region.py
@@ -0,0 +1,31 @@
+"""Copy the "Region" column from one CSV file into another."""
+import os
+
+import pandas as pd
+
+
+def update_csv_with_region(csv_file_with_region, csv_file_to_update):
+    """Write a copy of csv_file_to_update with a "Region" column added.
+
+    Regions come from csv_file_with_region, matched on "Country" with a
+    left join, so countries not found there get an empty region. The
+    result is saved beside csv_file_to_update as "updated_<file name>".
+    """
+    df_region = pd.read_csv(csv_file_with_region)
+    df_to_update = pd.read_csv(csv_file_to_update)
+
+    # Keep one region per country so the join cannot duplicate rows.
+    regions = df_region[['Country', 'Region']]
+    regions = regions.drop_duplicates(subset='Country')
+
+    merged_df = pd.merge(df_to_update, regions, on='Country', how='left')
+
+    # Prefix the file name only, so paths with a directory keep working.
+    folder, name = os.path.split(csv_file_to_update)
+    output_file = os.path.join(folder, 'updated_' + name)
+    merged_df.to_csv(output_file, index=False)
+
+
+if __name__ == "__main__":
+    # Replace these with your own file paths.
+    update_csv_with_region('2022cleaned.csv', '2017cleaned.csv')