-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCoffeeReading.py
177 lines (92 loc) · 6.89 KB
/
CoffeeReading.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import os
import pandas as pd
import numpy as np
import json
from cleantext import clean
import re
from datetime import datetime
# Show every column when a DataFrame is printed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)
# Uncomment to also lift the limit on the number of printed rows.
#pd.set_option("display.max_rows", None)
# Create the ./Data output directory at import time (no error if it already exists);
# exportCleanData writes its CSV into it.
os.makedirs("./Data", exist_ok=True)
def readDataset(fileName: str):
    """Load the CSV file at *fileName* into a pandas DataFrame.

    The file is decoded as UTF-8 and none of its columns is used as the index.
    """
    # Handy exploratory checks on the returned frame while developing:
    # .dtypes, .describe(), .nunique(), .isna().sum()
    return pd.read_csv(fileName, encoding="utf-8", index_col=False)
def convertAltitude(alt: str):
    """Return the approximate altitude described by *alt* as an integer.

    A value that is already a plain integer (as text or as a number) is
    returned unchanged. Otherwise every number found in the text is averaged,
    so a range such as "1200-1400" or "1600 - 1900" yields its midpoint and a
    value with a unit such as "1200 m" yields the number itself. The mean is
    truncated towards zero.

    Raises:
        ValueError: if *alt* contains no number at all.
    """
    text = str(alt).strip()

    # Fast path: a single integer. Checking this first (rather than guessing
    # from the string length) keeps five-digit or signed values working.
    try:
        return int(text)
    except ValueError:
        pass

    # Ranges and annotated values: collect every number regardless of the
    # separator used ("-", "~", " A ", ...), then average them.
    readings = [float(token) for token in re.findall(r"\d+(?:\.\d+)?", text)]
    if not readings:
        raise ValueError(f"No altitude value found in {alt!r}")
    return int(sum(readings) / len(readings))
def splitYear(year: str):
    """Return the last four characters of *year*, i.e. the final harvest year.

    A harvest spanning two years (e.g. "2021 / 2022") is reduced to its last
    year ("2022"); a single four-character year is returned as is. Surrounding
    whitespace is ignored so that a trailing blank does not end up inside the
    four-character slice.

    Raises:
        ValueError: if fewer than four characters remain after stripping.
    """
    lastYear = year.strip()[-4:]
    if len(lastYear) < 4:
        # ValueError is a subclass of Exception, so existing handlers still catch it.
        raise ValueError("Wrong year characters length")
    return lastYear
def convertDateFormat(date: str):
    """Convert a date such as "September 21st, 2022" to "2022-09-21".

    Ordinal suffixes (st, nd, rd, th) following a day number are removed, the
    remaining text is parsed as "<month name> <day>, <year>", and the result
    is returned as a "YYYY-MM-DD" string.
    """
    # Drop the ordinal suffix so strptime can read the day as a plain number.
    withoutOrdinals = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date)
    parsed = datetime.strptime(withoutOrdinals.strip(), "%B %d, %Y")
    return parsed.strftime("%Y-%m-%d")
def cleanBagWeight(bagWeight: str):
    """Return only the digit characters of *bagWeight*, joined in order.

    For example "60 kg" becomes "60". The result is still a string and is
    empty when the input contains no digit.
    """
    return ''.join(filter(str.isdigit, bagWeight))
def _splitPrimarySecondary(values: pd.Series) -> pd.DataFrame:
    """Split each comma-separated string into exactly two columns (0 and 1)."""
    # n=1 caps the split at two parts and reindex pads a missing second column,
    # so the caller's two-column assignment never fails on a shape mismatch.
    return values.str.split(",", n=1, expand=True).reindex(columns=[0, 1])


def _normalizeVariety(name: str) -> str:
    """Title-case a variety name and normalise its separators to plain commas."""
    name = re.sub(r"[^\w\s]", ",", name.title())
    # Fix the "Unknow" typo; the second replace undoes the extra "n" that the
    # first one appends to names that were already spelled "Unknown".
    name = name.replace("Unknow", "Unknown").replace("Unknownn", "Unknown")
    for separator in (" , ", ", ", " And "):
        name = name.replace(separator, ",")
    return name


def cleanData(fileName: str, countryCodesFile: str = "countryCodesISO3166.csv") -> pd.DataFrame:
    """Read the raw coffee CSV and return a cleaned DataFrame.

    Steps, in order: drop the unused columns, rename the remaining ones to
    short identifiers, drop every row containing a missing value, then
    normalise the individual columns (altitude, harvest year, free-text names,
    dates, bag weight, processing method, colour, region and variety).

    Args:
        fileName: path of the raw CSV file.
        countryCodesFile: path of a CSV with "countryName" and "2LCode"
            columns, used to add the "ISO3166A2" column.

    Returns:
        The cleaned DataFrame. "ProcessingMethod" and "Color" are replaced by
        Primary*/Secondary* column pairs.

    Raises:
        KeyError: if an expected column is missing, or an "Origin" value is
            not listed in *countryCodesFile*.
        ValueError: propagated from the per-value converters on malformed data.
    """
    coffee = readDataset(fileName)
    coffee = coffee.drop(columns=["Unnamed: 0", "ICO Number", "Certification Body", "Certification Address", "Certification Contact"])
    coffee = coffee.rename(columns={
        "Country of Origin": "Origin", "Farm Name": "Farm", "Lot Number": "Lot", "Number of Bags": "NBags",
        "Bag Weight": "BagWeight", "Altitude": "ApproxAltitude", "In-Country Partner": "Partner", "Harvest Year": "HarvestYear",
        "Processing Method": "ProcessingMethod", "Clean Cup": "CleanCup", "Total Cup Points": "CupPoints",
        "Moisture Percentage": "Moisture", "Category One Defects": "C1Defects", "Category Two Defects": "C2Defects",
        "Grading Date": "GradingDate",
    })
    coffee = coffee.dropna()  # Dropping the rows which contain NAs

    # Averaging the altitude of the coffee cultivations
    coffee["ApproxAltitude"] = coffee["ApproxAltitude"].apply(convertAltitude)
    # In case the harvest lasted longer than one year only the last one is kept
    coffee["HarvestYear"] = coffee["HarvestYear"].apply(splitYear)

    # Stripping punctuation and emoji from the free-text name columns, keeping the letter case
    for column in ("Partner", "Owner", "Producer", "Company", "Mill", "Farm"):
        coffee[column] = coffee[column].apply(lambda text: clean(text, no_punct=True, no_emoji=True, lower=False))

    # Adding the countries' ISO 3166 Alpha-2 codes. keep_default_na=False stops
    # pandas from reading strings such as "NA" in the table as missing values.
    countryNamesAndCode = pd.read_csv(countryCodesFile, index_col=False, keep_default_na=False)
    countryNames = dict(zip(countryNamesAndCode["countryName"], countryNamesAndCode["2LCode"]))
    coffee["ISO3166A2"] = coffee["Origin"].apply(lambda origin: countryNames[origin])

    # Converting the various date variations into a standard format, then to datetime dtype
    for column in ("GradingDate", "Expiration"):
        coffee[column] = pd.to_datetime(coffee[column].apply(convertDateFormat))

    # Removing the "kg" string from every weight and storing it as an integer
    coffee["BagWeight"] = coffee["BagWeight"].apply(cleanBagWeight).astype("int32")

    # .title() standardizes the capitalisation; every punctuation character is
    # then turned into "," so the value can be divided into a primary and
    # (when there is more than one) a secondary entry.
    coffee["ProcessingMethod"] = coffee["ProcessingMethod"].apply(lambda x: re.sub(r"[^\w\s]", ",", x.title()))
    coffee[["PrimaryProcessingMethod", "SecondaryProcessingMethod"]] = _splitPrimarySecondary(coffee["ProcessingMethod"])
    coffee["Color"] = coffee["Color"].apply(lambda x: re.sub(r"[^\w\s]", ",", x.title()))
    coffee[["PrimaryColor", "SecondaryColor"]] = _splitPrimarySecondary(coffee["Color"])
    # Dropping old columns which contained multiple data in them
    coffee = coffee.drop(columns=["ProcessingMethod", "Color"])

    # Standardizing the name of the region where the coffee comes from
    coffee["Region"] = coffee["Region"].apply(lambda x: x.title())
    # Standardizing the name of the coffee variety
    coffee["Variety"] = coffee["Variety"].apply(_normalizeVariety)
    return coffee
def exportCleanData(data: pd.DataFrame) -> None:
    """Write *data* as CSV to Data/CoffeCleaned.csv under the current directory.

    The Data directory must already exist.
    """
    destination = f"{os.curdir}/Data/CoffeCleaned.csv"
    data.to_csv(destination)