
Commit bbe02f8

Add files via upload
1 parent 150cae1 commit bbe02f8

5 files changed: +388 -0 lines changed

Algorithm_test_harness.py

Lines changed: 59 additions & 0 deletions
from random import randrange

# Split a dataset into a train and a test set
def train_test_split(dataset, split):
    train = list()
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Evaluate an algorithm using repeated train/test splits
def evaluate_algorithm_tt_split(dataset, algorithm, split, n_splits, performance_assessment, *args):
    scores = list()
    for _ in range(n_splits):
        train, test = train_test_split(dataset, split)
        test_set = list()
        for row in test:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train, test_set, *args)
        actual = [row[-1] for row in test]
        performance = performance_assessment(actual, predicted)
        scores.append(performance)
    return scores

# Evaluate an algorithm using a cross-validation split
def evaluate_algorithm_cv(dataset, algorithm, n_folds, performance_assessment, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        performance = performance_assessment(actual, predicted)
        scores.append(performance)
    return scores
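
As a quick sanity check of the harness, here is a minimal sketch (not part of this commit) that drives evaluate_algorithm_cv with a toy dataset and a hypothetical zero-rule baseline that always predicts the most common training class:

    from random import seed
    from Algorithm_test_harness import evaluate_algorithm_cv
    from Performance_assessment import getAccuracy

    # Hypothetical baseline: predict the most common class seen in the training set
    def zero_rule_algorithm(train, test):
        outcomes = [row[-1] for row in train]
        prediction = max(set(outcomes), key=outcomes.count)
        return [prediction for _ in test]

    seed(1)
    toy_dataset = [[2.7, 0], [1.4, 0], [3.3, 0], [7.6, 1], [8.6, 1], [6.9, 1]]
    scores = evaluate_algorithm_cv(toy_dataset, zero_rule_algorithm, 3, getAccuracy)
    print(scores)  # one accuracy value per fold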

Open_Conversion_Data.py

Lines changed: 79 additions & 0 deletions
import csv
from math import sqrt

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert a string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Convert a string column to integer and return the lookup table
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup


##### Normalize Data #####

# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        min_value = min(col_values)
        max_value = max(col_values)
        minmax.append([min_value, max_value])
    return minmax

# Normalize the dataset, except the last column which holds the class values
def Normalize_Dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)-1):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


##### Standardize Data #####

# Calculate column means
def column_means(dataset):
    means = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means

# Calculate column standard deviations
def column_stdevs(dataset, means):
    stdevs = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        variance = [pow(row[i] - means[i], 2) for row in dataset]
        stdevs[i] = sum(variance)
    stdevs = [sqrt(x / float(len(dataset)-1)) for x in stdevs]
    return stdevs

# Standardize the dataset
def Standardize_Dataset(dataset, means, stdevs):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - means[i]) / stdevs[i]
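
For context, a minimal sketch of how these helpers chain together (the file name below is hypothetical):

    from Open_Conversion_Data import load_csv, str_column_to_float, str_column_to_int, dataset_minmax, Normalize_Dataset

    dataset = load_csv('some_numeric_data.csv')  # hypothetical CSV with the class label in the last column
    for i in range(len(dataset[0]) - 1):
        str_column_to_float(dataset, i)
    str_column_to_int(dataset, len(dataset[0]) - 1)
    minmax = dataset_minmax(dataset)
    Normalize_Dataset(dataset, minmax)  # all attribute columns are now scaled to [0, 1]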

Performance_assessment.py

Lines changed: 72 additions & 0 deletions
### Methods to assess accuracy of prediction ###
from math import sqrt

####### Accuracy for classification problems #######

# Get accuracy of prediction (percentage of exact matches)
def getAccuracy(actual, predicted):
    correct = 0
    for x in range(len(actual)):
        if actual[x] == predicted[x]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Calculate a confusion matrix (rows are actual classes, columns are predicted classes)
def confusion_matrix(actual, predicted):
    unique = set(actual)
    matrix = [list() for x in range(len(unique))]
    for i in range(len(unique)):
        matrix[i] = [0 for x in range(len(unique))]
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for i in range(len(actual)):
        x = lookup[actual[i]]
        y = lookup[predicted[i]]
        matrix[x][y] += 1
    return unique, matrix

# Print a confusion matrix
def print_confusion_matrix(unique, matrix):
    print('Unique prediction values:')
    print('(P)' + ' '.join(str(x) for x in unique))
    print('(A)---')
    print("Confusion Matrix:")
    for i, x in enumerate(unique):
        print("%s| %s" % (x, ' '.join(str(x) for x in matrix[i])))

# Recall, precision and F1 score from a confusion matrix
# Note: the values returned are those of the last class processed in the loop
def recall_precision_calc(matrix):
    for i in range(len(matrix[0])):
        row_values = matrix[i]  # row i: actual class i
        col_values = [row[i] for row in matrix]  # column i: predicted class i
        tp = row_values[i]  # diagonal entry: actual i and predicted i
        fn = sum(row_values) - row_values[i]  # actual i, predicted as another class
        fp = sum(col_values) - col_values[i]  # predicted i, actually another class

        recall = tp / (tp + fn)
        precision = tp / (tp + fp)

        F1_score = 2 * (precision * recall) / (precision + recall)

    return recall, precision, F1_score


###### Accuracy methods for regression problems ######

# Calculate mean absolute error (MAE)
def mae_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        sum_error += abs(predicted[i] - actual[i])
    return sum_error / float(len(actual))

# Calculate root mean squared error (RMSE)
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)
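
A small sketch of how these classification metrics combine; the labels below are made up for illustration:

    from Performance_assessment import getAccuracy, confusion_matrix, print_confusion_matrix, recall_precision_calc

    actual    = [0, 0, 1, 1, 1, 0]
    predicted = [0, 1, 1, 1, 0, 0]
    print(getAccuracy(actual, predicted))  # 4 of the 6 labels match
    unique, matrix = confusion_matrix(actual, predicted)
    print_confusion_matrix(unique, matrix)
    recall, precision, f1 = recall_precision_calc(matrix)
    print(recall, precision, f1)  # metrics for the last class in the loop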

Sonar_case_study.py

Lines changed: 42 additions & 0 deletions
# Import, conversion and normalization of data
from Open_Conversion_Data import load_csv
from Open_Conversion_Data import str_column_to_float
from Open_Conversion_Data import str_column_to_int

# Algorithm evaluation harness
from Algorithm_test_harness import evaluate_algorithm_cv

# Performance metric
from Performance_assessment import getAccuracy

# Math/random functions
from math import sqrt
from random import seed

# Random forest model
from Tree_model_RF import random_forest

def main():
    # Test the random forest algorithm on the sonar dataset
    seed(2)
    # Load and prepare data
    filename = 'sonar-all-data.csv'
    dataset = load_csv(filename)
    # Convert string attributes to floats
    for i in range(0, len(dataset[0])-1):
        str_column_to_float(dataset, i)
    # Convert the class column to integers
    str_column_to_int(dataset, len(dataset[0])-1)
    # Evaluate the algorithm
    n_folds = 5
    max_depth = 10
    min_size = 1.0
    sample_size = 1.0
    n_features = int(sqrt(len(dataset[0])-1))
    for n_trees in [1, 5, 10]:
        scores = evaluate_algorithm_cv(dataset, random_forest, n_folds, getAccuracy,
                                       max_depth, min_size, sample_size, n_trees, n_features)
        print('Trees: %d' % n_trees)
        print('Scores: %s' % scores)
        print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))

main()
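
The case study expects sonar-all-data.csv to be the UCI sonar (mines vs. rocks) dataset: 208 rows, each with 60 numeric attributes followed by a class label 'M' or 'R' in the last column; str_column_to_int maps those labels to 0/1 before evaluation.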

Tree_model_RF.py

Lines changed: 136 additions & 0 deletions
from random import randrange


# Create a random subsample of the dataset with replacement
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

# Gini index as the cost function to minimize
def gini_index(groups, classes):
    # Count all samples at the split point
    n_instances = float(sum([len(group) for group in groups]))
    # Weighted sum of the per-group Gini indexes
    Gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0.0:
            continue
        score = 0.0
        # Score the group based on the proportion of each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # Weight the group score by its relative size
        Gini += (1.0 - score) * (size / n_instances)
    return Gini

# Split a dataset on an attribute index and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Best split point for a dataset, chosen from a random subset of attributes
# so that the trees of the forest do not all rely on the same features
def get_split_forest(dataset, n_attributes):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    attributes = list()
    while len(attributes) < n_attributes:
        index = randrange(len(dataset[0])-1)
        if index not in attributes:
            attributes.append(index)
    for index in attributes:
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            Gini = gini_index(groups, class_values)
            #print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], Gini))
            if Gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], Gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

# Create a terminal node value (the most common class in the group)
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Split a node or turn its children into terminal nodes, depending on the parameters
def split_node_forest(node, max_depth, min_size, n_attributes, depth):
    left, right = node['groups']
    # Delete the stored rows from the node, they are no longer needed
    del(node['groups'])
    # If either child is empty, create a single terminal node from the combined rows
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # If the maximum depth of the tree is reached, create terminal nodes
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # Process both children: terminate if the minimum size is reached, otherwise split further
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split_forest(left, n_attributes)
        split_node_forest(node['left'], max_depth, min_size, n_attributes, depth+1)
    # Right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split_forest(right, n_attributes)
        split_node_forest(node['right'], max_depth, min_size, n_attributes, depth+1)

# Build a decision tree
def build_tree(train, max_depth, n_attributes, min_size):
    root = get_split_forest(train, n_attributes)
    split_node_forest(root, max_depth, min_size, n_attributes, 1)
    return root

# Print a decision tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('%s[%s]' % ((depth*' ', node)))

# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Make a prediction with a list of bagged trees (majority vote)
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)


# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    trees = list()
    for _ in range(n_trees):
        sample = subsample(train, sample_size)
        # Argument order follows build_tree(train, max_depth, n_attributes, min_size)
        tree = build_tree(sample, max_depth, n_features, min_size)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions
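
A minimal sketch of the tree code in isolation, on a made-up two-attribute toy dataset (not part of this commit):

    from Tree_model_RF import build_tree, print_tree, predict

    toy = [[2.771, 1.784, 0], [1.728, 1.169, 0], [3.678, 2.812, 0],
           [7.497, 3.162, 1], [9.002, 3.339, 1], [7.444, 0.476, 1]]
    tree = build_tree(toy, max_depth=3, n_attributes=1, min_size=1)
    print_tree(tree)
    print(predict(tree, [6.642, 3.319, None]))  # prints one of the class values 0 or 1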
