added cleaning scripts

ncavestany · May 16, 2024 · 09302ff · 09302ff
1 parent 4cf2280
commit 09302ff
Show file tree

Hide file tree

Showing 3 changed files with 80 additions and 0 deletions.
diff --git a/scripts/addYearRank.py b/scripts/addYearRank.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import os
+
def add_headers(csv_file, output_dir=None):
    """Add a ``Year`` column, and a rank column if none exists, to a CSV.

    The year value is the part of the file's base name before the first
    ``.`` (``data/2022.csv`` -> ``"2022"``). A ``Happiness Rank`` column is
    added only when the file has none of the known rank headers. The result
    is written to ``<year>cleaned.csv``.

    Args:
        csv_file: Path of the CSV file to read.
        output_dir: Directory to write the cleaned file into. ``None``
            (the default) writes to the current working directory, which is
            the historical behaviour.

    Returns:
        The path of the written file, or ``None`` if ``csv_file`` does not
        exist (a message is printed in that case).
    """
    if not os.path.isfile(csv_file):
        print("File does not exist.")
        return None

    df = pd.read_csv(csv_file)

    # The file is expected to be named after its year, e.g. "2022.csv".
    year = os.path.basename(csv_file).split('.')[0]
    df['Year'] = year

    # Header spellings of the rank column that are treated as "already ranked".
    rank_headers = ('RANK', 'Happiness.Rank', 'Overall rank', 'Happiness Rank')
    if not any(header in df.columns for header in rank_headers):
        # Rank is the 1-based row position, independent of whatever index
        # read_csv inferred.
        # NOTE(review): assumes rows are already ordered best-first -- confirm.
        df['Happiness Rank'] = range(1, len(df) + 1)

    output_file = f"{year}cleaned.csv"
    if output_dir is not None:
        output_file = os.path.join(output_dir, output_file)

    df.to_csv(output_file, index=False)
    print(f"Headers added successfully. Saved as {output_file}")
    return output_file
+
# Example usage: runs only when the file is executed as a script, so
# importing this module does not touch the file system.
if __name__ == "__main__":
    csv_file = "2022.csv"  # Provide the path to your CSV file here
    add_headers(csv_file)
diff --git a/scripts/cleanHeaders.py b/scripts/cleanHeaders.py
@@ -0,0 +1,30 @@
+import os
+import pandas as pd
+
def process_csv_files(folder_path):
    """Normalise every ``.csv`` file in ``folder_path`` in place.

    For each file:
      1. rows whose ``Country`` is ``"United States"`` are renamed to
         ``"United States of America"``;
      2. literal periods are removed from ``Happiness Score``;
      3. ``Happiness Score`` is left-padded with zeroes to at least 4
         characters.

    The file is then overwritten with the result.

    Args:
        folder_path: Directory containing the CSV files to rewrite.

    Raises:
        KeyError: If a file lacks a ``Country`` or ``Happiness Score`` column.
    """
    csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]

    for file in csv_files:
        file_path = os.path.join(folder_path, file)

        # Read the score as text: with the default float parsing the ``.str``
        # accessor below raises AttributeError, and the digits as written in
        # the file (e.g. trailing zeroes) would be lost.
        df = pd.read_csv(file_path, dtype={'Happiness Score': str})

        # Task 1: use the long country name for the United States.
        df.loc[df['Country'] == 'United States', 'Country'] = 'United States of America'

        # Task 2: strip periods. regex=False is explicit because older pandas
        # treated the pattern as a regex, where '.' matches every character.
        scores = df['Happiness Score'].str.replace('.', '', regex=False)

        # Task 3: pad to 4 characters. ``.str.zfill`` leaves empty cells
        # (NaN) untouched instead of crashing on them.
        # NOTE(review): zfill pads on the LEFT, so "7.5" becomes "0075"; if
        # the score is meant as fixed-point thousandths, confirm whether
        # right-padding was intended.
        df['Happiness Score'] = scores.str.zfill(4)

        # Overwrite the source file with the cleaned data.
        df.to_csv(file_path, index=False)

        print(f"Processed file: {file}")
+
# Example usage: runs only when the file is executed as a script, so
# importing this module does not rewrite any files.
if __name__ == "__main__":
    folder_path = '/test'
    process_csv_files(folder_path)
diff --git a/scripts/region.py b/scripts/region.py
@@ -0,0 +1,18 @@
+import pandas as pd
+
def update_csv_with_region(csv_file_with_region, csv_file_to_update):
    """Copy the ``Region`` column from one CSV into another, matched by country.

    Both files must have a ``Country`` column; the first must also have a
    ``Region`` column. Every row of ``csv_file_to_update`` is kept (left
    join); countries missing from the region file get an empty ``Region``.
    The result is written next to ``csv_file_to_update`` with ``updated_``
    prefixed to its file name.

    Args:
        csv_file_with_region: Path of the CSV that supplies ``Region``.
        csv_file_to_update: Path of the CSV that receives ``Region``.

    Returns:
        The path of the written file.
    """
    import os  # local import: this script only imports pandas at file level

    df_region = pd.read_csv(csv_file_with_region)
    df_to_update = pd.read_csv(csv_file_to_update)

    # Keep one region per country so the left join cannot multiply rows when
    # the region file lists a country more than once.
    region_lookup = df_region[['Country', 'Region']].drop_duplicates(subset='Country')

    # Merge on 'Country'; the added column keeps the name 'Region'.
    merged_df = pd.merge(df_to_update, region_lookup, on='Country', how='left')

    # Prefix the file NAME, not the whole path, so "data/2017.csv" becomes
    # "data/updated_2017.csv" rather than the bogus "updated_data/2017.csv".
    directory, filename = os.path.split(csv_file_to_update)
    output_file = os.path.join(directory, 'updated_' + filename)

    merged_df.to_csv(output_file, index=False)
    return output_file
+
# Example usage: replace the two paths below with your own files. Runs only
# when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    update_csv_with_region('2022cleaned.csv', '2017cleaned.csv')