-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.py
158 lines (140 loc) · 6.75 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
def summary_to_file():
    """Write a plain-text summary of the dataset to ``analysis_summary.txt``.

    Reads the module-level DataFrame ``ifds`` and writes, in order: the frame
    itself, its ``describe()`` statistics, its ``info()`` report, and the
    count and percentage share of each value in the ``Species`` column.

    The report goes through an explicit file handle instead of rebinding
    ``sys.stdout``: standard output stays usable after the call, and the file
    is closed even if one of the steps raises.
    """
    rule = "=============================================================================="

    with open("analysis_summary.txt", "w", encoding="utf-8") as report:

        def emit(value):
            # Same formatting as a plain print(), but aimed at the report file.
            print(value, file=report)

        emit("\n")
        emit(rule)
        emit("Overview of the whole dataset:")
        emit("\n")
        emit(ifds)
        emit("\n")
        emit(rule)
        emit("Summary of numeric values: ")
        emit("\n")
        # reference for .describe() and .info(): https://towardsdatascience.com/getting-started-to-data-analysis-with-python-pandas-with-titanic-dataset-a195ab043c77
        emit(ifds.describe())
        emit("\n")
        emit(rule)
        emit("Number of samples of each type:")
        emit("\n")
        # info() writes its report itself and returns None, so it is pointed
        # at the file directly; printing its return value would add a stray
        # "None" line to the summary.
        ifds.info(buf=report)
        emit("\n")
        emit(rule)
        emit("Number of occurances of each of the species:")
        emit("\n")
        # reference for idea: https://medium.com/@avulurivenkatasaireddy/exploratory-data-analysis-of-iris-data-set-using-python-823e54110d2d
        emit(ifds["Species"].value_counts())
        emit("\n")
        emit(" In percentile:")
        emit("\n")
        # reference for normalize=True: https://towardsdatascience.com/getting-more-value-from-the-pandas-value-counts-aa17230907a6
        emit(ifds["Species"].value_counts(normalize=True) * 100)
        emit("\n")
        emit(rule)
# histogram of sepal length, one overlaid series per species
def sepal_length_hist():
    """Plot sepal-length histograms for the three species on one figure.

    Reads the module-level subsets ``iris_s``, ``iris_vers`` and ``iris_virg``,
    saves the figure as ``Sepal-lenght.png`` and then shows it.
    """
    # reference for figsize: https://stackoverflow.com/questions/332289/how-do-you-change-the-size-of-figures-drawn-with-matplotlib
    plt.figure(figsize=(9, 9))

    # reference for plotting multiple histograms on one plot with different colors:
    # https://cmdlinetips.com/2019/02/how-to-make-histogram-in-python-with-pandas-and-seaborn/
    layers = (
        (iris_s, "Iris setosa", "deeppink"),
        (iris_vers, "Iris versicolor", "mediumorchid"),
        (iris_virg, "Iris virginica", "navy"),
    )
    for subset, species_label, bar_color in layers:
        sns.distplot(subset["SepalLengthCm"], kde=False, label=species_label, color=bar_color)

    plt.title("Sepal length in cm", size=20)
    # the title already names the measurement, so the x axis label is blanked
    plt.xlabel("")
    plt.ylabel("Frequency", size=16)
    plt.legend()

    # NOTE: the output file name is kept exactly as the script has always written it
    plt.savefig("Sepal-lenght.png")
    plt.show()
# histogram of sepal width, one overlaid series per species
def sepal_width_hist():
    """Plot sepal-width histograms for the three species on one figure.

    Reads the module-level subsets ``iris_s``, ``iris_vers`` and ``iris_virg``,
    saves the figure as ``Sepal-width.png`` and then shows it.
    """
    plt.figure(figsize=(9, 9))

    # (subset, legend label, colour), drawn in this order
    layers = (
        (iris_s, "Iris setosa", "deeppink"),
        (iris_vers, "Iris versicolor", "mediumorchid"),
        (iris_virg, "Iris virginica", "navy"),
    )
    for subset, species_label, bar_color in layers:
        sns.distplot(subset["SepalWidthCm"], kde=False, label=species_label, color=bar_color)

    plt.title("Sepal width in cm", size=20)
    # the title already names the measurement, so the x axis label is blanked
    plt.xlabel("")
    plt.ylabel("Frequency", size=16)
    plt.legend()

    plt.savefig("Sepal-width.png")
    plt.show()
# histogram of petal length, one overlaid series per species
def petal_length_hist():
    """Plot petal-length histograms for the three species on one figure.

    Reads the module-level subsets ``iris_s``, ``iris_vers`` and ``iris_virg``,
    saves the figure as ``Petal-lenght.png`` and then shows it.
    """
    plt.figure(figsize=(9, 9))

    # (subset, legend label, colour), drawn in this order
    layers = (
        (iris_s, "Iris setosa", "deeppink"),
        (iris_vers, "Iris versicolor", "mediumorchid"),
        (iris_virg, "Iris virginica", "navy"),
    )
    for subset, species_label, bar_color in layers:
        sns.distplot(subset["PetalLengthCm"], kde=False, label=species_label, color=bar_color)

    plt.title("Petal length in cm", size=20)
    # the title already names the measurement, so the x axis label is blanked
    plt.xlabel("")
    plt.ylabel("Frequency", size=16)
    plt.legend()

    # NOTE: the output file name is kept exactly as the script has always written it
    plt.savefig("Petal-lenght.png")
    plt.show()
# histogram of petal width, one overlaid series per species
def petal_width_hist():
    """Plot petal-width histograms for the three species on one figure.

    Reads the module-level subsets ``iris_s``, ``iris_vers`` and ``iris_virg``,
    saves the figure as ``Petal-width.png`` and then shows it.
    """
    plt.figure(figsize=(9, 9))

    # (subset, legend label, colour), drawn in this order
    layers = (
        (iris_s, "Iris setosa", "deeppink"),
        (iris_vers, "Iris versicolor", "mediumorchid"),
        (iris_virg, "Iris virginica", "navy"),
    )
    for subset, species_label, bar_color in layers:
        sns.distplot(subset["PetalWidthCm"], kde=False, label=species_label, color=bar_color)

    plt.title("Petal width in cm", size=20)
    # the title already names the measurement, so the x axis label is blanked
    plt.xlabel("")
    plt.ylabel("Frequency", size=16)
    plt.legend()

    plt.savefig("Petal-width.png")
    plt.show()
# runs every histogram function in turn
def histograms():
    """Draw the four histograms: sepal length, sepal width, petal length, petal width."""
    for draw_histogram in (
        sepal_length_hist,
        sepal_width_hist,
        petal_length_hist,
        petal_width_hist,
    ):
        draw_histogram()
# scatterplot of sepal length against sepal width
def sepal_length_width_scat():
    """Plot sepal length versus sepal width, coloured by the ``Species`` column.

    Reads the module-level DataFrame ``ifds``, saves the figure as
    ``Sepal-length-width.png`` and then shows it.
    """
    plt.figure(figsize=(9, 9))

    # reference for scatterplot: https://honingds.com/blog/seaborn-scatterplot/
    # https://seaborn.pydata.org/generated/seaborn.scatterplot.html
    # reference for marker: https://matplotlib.org/3.1.1/api/markers_api.html
    # reference for scatter outline: https://seaborn.pydata.org/generated/seaborn.scatterplot.html
    # https://stackoverflow.com/questions/50706901/matplotlib-border-around-scatter-plot-points
    point_style = {
        "marker": "o",
        "hue": "Species",
        "palette": ["deeppink", "mediumorchid", "navy"],
        "edgecolor": "dimgrey",
    }
    sns.scatterplot(x="SepalLengthCm", y="SepalWidthCm", data=ifds, **point_style)

    plt.title("Sepal length and Sepal width comparison", size=20)
    plt.xlabel("Sepal length", size=16)
    plt.ylabel("Sepal width", size=16)
    plt.legend()

    plt.savefig("Sepal-length-width.png")
    plt.show()
# scatterplot of petal length against petal width
def petal_length_width_scat():
    """Plot petal length versus petal width, coloured by the ``Species`` column.

    Reads the module-level DataFrame ``ifds``, saves the figure as
    ``Petal-length-width.png`` and then shows it.
    """
    plt.figure(figsize=(9, 9))

    point_style = {
        "marker": "o",
        "hue": "Species",
        "palette": ["deeppink", "mediumorchid", "navy"],
        "edgecolor": "dimgrey",
    }
    sns.scatterplot(x="PetalLengthCm", y="PetalWidthCm", data=ifds, **point_style)

    plt.title("Petal length and Petal width comparison", size=20)
    plt.xlabel("Petal length", size=16)
    plt.ylabel("Petal width", size=16)
    plt.legend()

    plt.savefig("Petal-length-width.png")
    plt.show()
# runs every scatterplot function in turn
def scatterplots():
    """Draw the sepal scatterplot followed by the petal scatterplot."""
    for draw_scatter in (sepal_length_width_scat, petal_length_width_scat):
        draw_scatter()
# pairplot of the whole dataset
def pairplot():
    """Draw a seaborn pairplot of ``ifds`` coloured by ``Species``.

    Uses histograms on the diagonal, saves the figure as
    ``Iris-dataset-pairplot.png`` and then shows it.
    """
    grid_options = {
        "hue": "Species",
        "diag_kind": "hist",
        "palette": ["deeppink", "mediumorchid", "navy"],
    }
    sns.pairplot(ifds, **grid_options)

    plt.savefig("Iris-dataset-pairplot.png")
    plt.show()
# main part of the code: load the data, write the text summary, draw the figures
# index_col="Id" makes the Id column the DataFrame index instead of a data column
# reference for index_col: https://realpython.com/python-csv/
ifds = pd.read_csv("Iris_dataset.csv", index_col="Id")

summary_to_file()

# per-species subsets; the histogram functions read these module-level names
iris_s = ifds.loc[ifds["Species"] == "Iris-setosa"]
iris_vers = ifds.loc[ifds["Species"] == "Iris-versicolor"]
iris_virg = ifds.loc[ifds["Species"] == "Iris-virginica"]

histograms()
scatterplots()
pairplot()