Skip to content

Commit

Permalink
added cleaning scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
ncavestany committed May 16, 2024
1 parent 4cf2280 commit 09302ff
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 0 deletions.
32 changes: 32 additions & 0 deletions scripts/addYearRank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pandas as pd
import os

def add_headers(csv_file, output_dir=None):
    """Add a "Year" column, and a rank column if none exists, to a CSV file.

    The year is taken from the input file name: everything before the first
    "." of the base name (so "2022.csv" gives "2022"). The result is written
    to "<year>cleaned.csv"; the input file is left untouched.

    Args:
        csv_file: Path to the CSV file to read.
        output_dir: Optional directory for the output file. When None (the
            default) the file is written relative to the current working
            directory.

    Returns:
        The path of the written file, or None when ``csv_file`` does not
        exist (a message is printed in that case).
    """
    # Check if the file exists
    if not os.path.isfile(csv_file):
        print("File does not exist.")
        return None

    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Check if any of the known rank headers already exist
    rank_headers = ['RANK', 'Happiness.Rank', 'Overall rank', 'Happiness Rank']
    rank_header_exists = any(header in df.columns for header in rank_headers)

    # Add "Year" column with the name of the file (without extension)
    year = os.path.basename(csv_file).split('.')[0]
    df['Year'] = year

    # Add a rank column only if the file did not bring its own. The rank is
    # the 1-based row position in file order.
    # NOTE(review): presumably rows are already sorted best-first -- confirm.
    if not rank_header_exists:
        df['Happiness Rank'] = df.index + 1

    # Save the modified DataFrame to a new CSV
    output_file = f"{year}cleaned.csv"
    if output_dir is not None:
        output_file = os.path.join(output_dir, output_file)
    df.to_csv(output_file, index=False)
    print(f"Headers added successfully. Saved as {output_file}")
    return output_file

# Example usage. These statements sit at module level with no __main__ guard,
# so they execute every time this script is run: "2022.csv" is read and the
# result is saved as "2022cleaned.csv".
csv_file = "2022.csv" # Provide the path to your CSV file here
add_headers(csv_file)
30 changes: 30 additions & 0 deletions scripts/cleanHeaders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
import pandas as pd

def process_csv_files(folder_path):
    """Clean every ``.csv`` file in ``folder_path`` in place.

    For each file the function:
      1. renames the country "United States" to "United States of America";
      2. removes all periods from the "Happiness Score" column;
      3. left-pads the resulting score with zeroes to at least 4 characters.

    Each file is overwritten with the cleaned data.

    Args:
        folder_path: Directory whose ``.csv`` files are processed.

    Raises:
        KeyError: If a file lacks the "Country" or "Happiness Score" column.
    """
    # Sorted so files are processed (and reported) in a deterministic order.
    csv_files = sorted(
        file for file in os.listdir(folder_path) if file.endswith('.csv')
    )

    for file in csv_files:
        file_path = os.path.join(folder_path, file)

        # Read the score column as text: left to type inference it becomes a
        # float column, on which the .str accessor raises AttributeError, and
        # the digits as written in the file (e.g. trailing zeroes) are lost.
        df = pd.read_csv(file_path, dtype={'Happiness Score': str})

        # Task 1: Change row named United States to "United States of America"
        df.loc[df['Country'] == 'United States', 'Country'] = 'United States of America'

        # Task 2: Remove all periods from the column "Happiness Score".
        # regex=False makes "." a literal period on every pandas version
        # instead of relying on the version-dependent default.
        scores = df['Happiness Score'].str.replace('.', '', regex=False)

        # Task 3: Add zeroes to make sure the Happiness Score is 4 digits.
        # .str.zfill leaves missing values alone, where calling zfill on each
        # value would fail on NaN.
        # NOTE(review): zfill pads on the left ("7.4" -> "0074"); confirm that
        # right-padding ("7400") was not the intent.
        df['Happiness Score'] = scores.str.zfill(4)

        # Write the modified DataFrame back to the CSV file
        df.to_csv(file_path, index=False)

        print(f"Processed file: {file}")

# Example usage: every .csv file in folder_path is rewritten in place.
# NOTE(review): '/test' is an absolute path at the filesystem root -- confirm
# whether a relative 'test' folder was intended.
folder_path = '/test'
process_csv_files(folder_path)
18 changes: 18 additions & 0 deletions scripts/region.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd

def update_csv_with_region(csv_file_with_region, csv_file_to_update):
    """Copy the "Region" column from one CSV into another, matched by country.

    Rows of ``csv_file_to_update`` are kept as they are (left join on the
    "Country" column); countries with no match in the region file get an
    empty "Region". The result is written next to ``csv_file_to_update`` with
    "updated_" prefixed to its file name; neither input file is modified.

    Args:
        csv_file_with_region: CSV providing "Country" and "Region" columns.
        csv_file_to_update: CSV with a "Country" column that receives "Region".
    """
    import os  # local import: this script's header only imports pandas

    # Read both CSV files
    df_region = pd.read_csv(csv_file_with_region)
    df_to_update = pd.read_csv(csv_file_to_update)

    # Keep one region per country so that a country repeated in the region
    # file cannot multiply rows in the merged output.
    region_lookup = df_region[['Country', 'Region']].drop_duplicates(subset='Country')

    # Merge the two dataframes on 'Country'
    merged_df = pd.merge(df_to_update, region_lookup, on='Country', how='left')

    # Prefix only the file name: prepending to the whole path would turn
    # "data/x.csv" into "updated_data/x.csv", a different directory.
    directory, filename = os.path.split(csv_file_to_update)
    output_file = os.path.join(directory, 'updated_' + filename)

    # Save the updated dataframe to a new CSV file
    merged_df.to_csv(output_file, index=False)

# Example usage: the first argument is the CSV that supplies the region data,
# the second is the CSV to update. Replace both with your own file paths.
update_csv_with_region('2022cleaned.csv', '2017cleaned.csv')

0 comments on commit 09302ff

Please sign in to comment.