-
Notifications
You must be signed in to change notification settings - Fork 12
/
PCA_Automobile_sklearn.py
108 lines (89 loc) · 3.28 KB
/
PCA_Automobile_sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
## Automobile Scikit Learn Version
#
# Author: David Lee
# Create Date: 2018/11/2
#
# Detail:
# Total Data = 205
#
# PS. only Dimensionality Reduction
import numpy as np
import pandas as pd # Read csv
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder # Transform 'string' into class number
def loadData(path):
inputData = pd.read_csv(path)
Make_list = ['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',
'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche',
'renault', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo']
# Fill missing value in column "normalized-losses" as its own types car's mean
for make in Make_list:
mean_normalized_losses = np.mean(inputData[inputData['make'] == make]['normalized-losses'])
inputData.update(inputData[inputData['make'] == make]['normalized-losses'].fillna(mean_normalized_losses))
# Drop others whatever contain NA
inputData = inputData.dropna()
# Transform 'string' into class number
Labels = [
[],
[],
Make_list,
['diesel', 'gas'],
['std', 'turbo'],
['four', 'two'],
['hardtop', 'wagon', 'sedan', 'hatchback', 'convertible'],
['4wd', 'fwd', 'rwd'],
['front', 'rear'],
[],
[],
[],
[],
[],
['dohc', 'dohcv', 'l', 'ohc', 'ohcf', 'ohcv', 'rotor'],
['eight', 'five', 'four', 'six', 'three', 'twelve', 'two'],
[],
['1bbl', '2bbl', '4bbl', 'idi', 'mfi', 'mpfi', 'spdi', 'spfi'],
[],
[],
[],
[],
[],
[],
[],
[]
]
le = LabelEncoder()
dataTemp = np.mat(np.zeros((len(inputData), len(inputData.columns))))
for colIdx in range(len(inputData.columns)):
if Labels[colIdx]:
le.fit(Labels[colIdx])
dataTemp[:, colIdx] = np.mat(le.transform(inputData.iloc[:, colIdx])).T
else:
dataTemp[:, colIdx] = np.mat(inputData.iloc[:, colIdx]).T
return dataTemp
def tryComponentNumber(data, max_principlal_component):
pca = PCA(n_components=max_principlal_component)
pca.fit(data)
return pca.explained_variance_ratio_
def plotPCAVariance(PCNumbers, percentage_of_variance):
plt.title('Percentage of total variance contained in the first %d principal components' % PCNumbers)
plt.xlabel('Principal Component Number')
plt.ylabel('Percentage of Variance')
plt.grid(True)
plt.xticks(range(1, PCNumbers+1))
plt.plot(np.arange(1, PCNumbers+1), percentage_of_variance*100)
def main():
# Load Data
data = loadData('Datasets/imports-85.csv')
PCNumbers = 20
ShowFirstPC = 7
# Train Model
percentage_of_variance = tryComponentNumber(data, PCNumbers)
print('%% Variance for the first %d principle component:\n' % ShowFirstPC, np.round(percentage_of_variance*100, 2))
print('%% Cumulative for the first %d principle component:\n' % ShowFirstPC, [sum(np.round(percentage_of_variance*100, 2)[:i]) for i in range(PCNumbers)])
# Plot PC number vs. Variance
plotPCAVariance(PCNumbers, percentage_of_variance)
plt.show()
if __name__ == '__main__':
main()