forked from bnsreenu/python_for_microscopists
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path069a-Train_BOVW_V1.0.py
113 lines (86 loc) · 3.88 KB
/
069a-Train_BOVW_V1.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
__author__ = "Sreenivas Bhattiprolu"
__license__ = "Feel free to copy, I appreciate if you acknowledge Python for Microscopists"
# https://www.youtube.com/watch?v=PRceoMWcv1U
"""
All cell images resized to 128 x 128
Images used for testing are completely different from the ones used for training.
136 images for testing, each parasitized and uninfected (136 x 2)
104 images for training, each parasitized and uninfected (104 x 2)
Cannot upload large amounts of data to GitHub, so only 10 images of each class are uploaded.
Download full dataset from: ftp://lhcftp.nlm.nih.gov/Open-Access-Datasets/Malaria/cell_images.zip
"""
import cv2
import numpy as np
import os
# Get the training classes names and store them in a list
#Here we use folder names for class names
#train_path = 'dataset/train' # Names are Aeroplane, Bicycle, Car
# Root folder of the training set. Every entry inside it is treated as one
# class, and the entry name doubles as the class name (Parasitized and
# Uninfected for the malaria cell images).
train_path = 'images/cell_images/train'
training_names = os.listdir(train_path)

# Parallel accumulators filled further down: image_paths collects the path of
# every training image, image_classes the integer class ID of the image at
# the same position. class_id is the ID given to the first class.
class_id = 0
image_paths, image_classes = [], []
#To make it easy to list all file names in a directory let us define a function
#
def imglist(path):
    """Return the paths of all entries in the directory *path*.

    Args:
        path: Directory to list.

    Returns:
        A list with one ``os.path.join(path, name)`` string per directory
        entry, sorted by entry name. Empty if the directory is empty.

    Raises:
        OSError: If *path* cannot be listed (propagated from ``os.listdir``).
    """
    # os.listdir() returns entries in arbitrary order; sorting keeps the image
    # order (and everything derived from it) the same from run to run.
    return [os.path.join(path, name) for name in sorted(os.listdir(path))]
#Fill the placeholder empty lists with image path, classes, and add class ID number
#
# Walk the class folders in order. Each folder gets the next integer class ID,
# and every image inside it is recorded together with that ID.
for training_name in training_names:
    # Named class_dir (not "dir") so the built-in dir() is not shadowed.
    class_dir = os.path.join(train_path, training_name)
    class_path = imglist(class_dir)
    image_paths.extend(class_path)
    # One label per image keeps image_classes aligned with image_paths.
    image_classes.extend([class_id] * len(class_path))
    class_id += 1
# Create feature extraction and keypoint detector objects
#SIFT is not available anymore in openCV
# Create List where all the descriptors will be stored
# Per-image descriptors, stored as (image_path, descriptor_array) tuples in
# the same order as image_paths.
des_list = []

# BRISK is a good replacement for SIFT. ORB also works but didn't work well
# for this example. The argument 30 is BRISK's first positional parameter
# (the detection threshold).
brisk = cv2.BRISK_create(30)

for image_path in image_paths:
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread() reports failure by returning None instead of raising.
        # Fail here with the offending path rather than with an opaque
        # OpenCV assertion inside detectAndCompute().
        raise OSError("Could not read image file: " + image_path)
    kpts, des = brisk.detectAndCompute(im, None)
    if des is None:
        # No keypoints were found. Store an empty (0 x descriptor-size) array
        # so this image still has an entry and stacking keeps working.
        des = np.empty((0, brisk.descriptorSize()), dtype=np.uint8)
    des_list.append((image_path, des))
# Stack all the descriptors vertically in a numpy array
# Gather the descriptor arrays of all images and stack them with a single
# np.vstack call. Stacking once avoids re-copying the ever-growing array on
# every iteration. Entries that are None (no descriptors for that image) are
# left out because np.vstack cannot stack them.
descriptors = np.vstack(
    [descriptor for _, descriptor in des_list if descriptor is not None]
)

# kmeans works only on float, so convert the integer descriptors to float
descriptors_float = descriptors.astype(float)
# Perform k-means clustering and vector quantization
from scipy.cluster.vq import kmeans, vq

# Size of the visual vocabulary (number of k-means clusters). 100 clusters
# gave lower accuracy for the aeroplane example.
k = 200

# Cluster all training descriptors. voc receives the codebook of centroids
# and variance the second value returned by kmeans. iter=1 asks for a single
# k-means run.
voc, variance = kmeans(descriptors_float, k, iter=1)
# Calculate the histogram of features and represent them as vector
#vq Assigns codes from a code book to observations.
# One row per image, one column per visual word; each cell counts how many of
# the image's descriptors were assigned to that word.
im_features = np.zeros((len(image_paths), k), "float32")
for i, entry in enumerate(des_list):
    descriptor = entry[1]
    # An image without descriptors keeps its all-zero histogram; handing None
    # to vq() would raise.
    if descriptor is None or len(descriptor) == 0:
        continue
    # vq assigns each descriptor the index of its closest codebook entry.
    words, distance = vq(descriptor, voc)
    # Count the occurrences of every word index in one vectorized step.
    # minlength=k pads the result to the full vocabulary width.
    im_features[i] = np.bincount(words, minlength=k)
# Perform Tf-Idf vectorization
# Document frequency: for every visual word, the number of images whose
# histogram contains it at least once.
nbr_occurences = ((im_features > 0) * 1).sum(axis=0)

# Smoothed inverse document frequency, log((N + 1) / (df + 1)), as float32.
# NOTE(review): idf is only computed here; nothing later in this script
# multiplies it into im_features or saves it.
idf = np.log((len(image_paths) + 1.0) / (nbr_occurences + 1.0)).astype('float32')
# Scaling the words
#Standardize features by removing the mean and scaling to unit variance
#In a way normalization
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training histograms first...
stdSlr = StandardScaler()
stdSlr.fit(im_features)

# ...then replace the raw counts with their standardized values. The fitted
# scaler is kept in stdSlr because it is saved together with the model below.
im_features = stdSlr.transform(im_features)
#Train an algorithm to discriminate vectors corresponding to positive and negative training images
# Train the Linear SVM
from sklearn.svm import LinearSVC

# Linear SVM classifier. The iteration cap is raised to 10000 because the
# solver was not converging with the library's default limit.
clf = LinearSVC(max_iter=10000)

# Rows of im_features are the samples; image_classes holds the integer class
# ID of each row in the same order.
clf.fit(im_features, np.asarray(image_classes))
#Train Random forest to compare how it does against SVM
#from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier(n_estimators = 100, random_state=30)
#clf.fit(im_features, np.array(image_classes))
# Save the SVM
#Joblib dumps Python object into one file
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package (installed as a scikit-learn
# dependency) and fall back to the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Dump the classifier, the class names, the fitted scaler, the vocabulary size
# and the vocabulary itself as one tuple into a single compressed file.
joblib.dump((clf, training_names, stdSlr, k, voc), "bovw.pkl", compress=3)