
Commit bbe02f8

Add files via upload
1 parent 150cae1 commit bbe02f8

5 files changed: +388 -0 lines changed

Algorithm_test_harness.py

Lines changed: 59 additions & 0 deletions
from random import randrange

# Split a dataset into a train and a test set
def train_test_split(dataset, split):
    train = list()
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Evaluate an algorithm using repeated train/test splits
def evaluate_algorithm_tt_split(dataset, algorithm, split, n_splits, performance_assessment, *args):
    scores = list()
    for _ in range(n_splits):
        train, test = train_test_split(dataset, split)
        test_set = list()
        for row in test:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train, test_set, *args)
        actual = [row[-1] for row in test]
        performance = performance_assessment(actual, predicted)
        scores.append(performance)
    return scores

# Evaluate an algorithm using a cross-validation split
def evaluate_algorithm_cv(dataset, algorithm, n_folds, performance_assessment, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        performance = performance_assessment(actual, predicted)
        scores.append(performance)
    return scores
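
As a quick sanity check of the harness, here is a minimal sketch (not part of this commit) that drives evaluate_algorithm_cv with a toy dataset and a hypothetical zero-rule baseline that always predicts the most common training class:

    from random import seed
    from Algorithm_test_harness import evaluate_algorithm_cv
    from Performance_assessment import getAccuracy

    # Hypothetical baseline: predict the most common class seen in the training set
    def zero_rule_algorithm(train, test):
        outcomes = [row[-1] for row in train]
        prediction = max(set(outcomes), key=outcomes.count)
        return [prediction for _ in test]

    seed(1)
    toy_dataset = [[2.7, 0], [1.4, 0], [3.3, 0], [7.6, 1], [8.6, 1], [6.9, 1]]
    scores = evaluate_algorithm_cv(toy_dataset, zero_rule_algorithm, 3, getAccuracy)
    print(scores)  # one accuracy value per fold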

Open_Conversion_Data.py

Lines changed: 79 additions & 0 deletions
import csv
from math import sqrt

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert a string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Convert a string column to integer and return the lookup table
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup


##### Normalize Data #####

# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        min_value = min(col_values)
        max_value = max(col_values)
        minmax.append([min_value, max_value])
    return minmax

# Normalize the dataset, except the last column which holds the class values
def Normalize_Dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)-1):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


##### Standardize Data #####

# Calculate column means
def column_means(dataset):
    means = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means

# Calculate column standard deviations
def column_stdevs(dataset, means):
    stdevs = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        variance = [pow(row[i] - means[i], 2) for row in dataset]
        stdevs[i] = sum(variance)
    stdevs = [sqrt(x / float(len(dataset)-1)) for x in stdevs]
    return stdevs

# Standardize the dataset
def Standardize_Dataset(dataset, means, stdevs):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - means[i]) / stdevs[i]
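
For context, a minimal sketch of how these helpers chain together (the file name below is hypothetical):

    from Open_Conversion_Data import load_csv, str_column_to_float, str_column_to_int, dataset_minmax, Normalize_Dataset

    dataset = load_csv('some_numeric_data.csv')  # hypothetical CSV with the class label in the last column
    for i in range(len(dataset[0]) - 1):
        str_column_to_float(dataset, i)
    str_column_to_int(dataset, len(dataset[0]) - 1)
    minmax = dataset_minmax(dataset)
    Normalize_Dataset(dataset, minmax)  # all attribute columns are now scaled to [0, 1]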

Performance_assessment.py

Lines changed: 72 additions & 0 deletions
### Methods to assess accuracy of prediction ###
from math import sqrt

####### Accuracy for classification problems #######

# Get accuracy of prediction (percentage of exact matches)
def getAccuracy(actual, predicted):
    correct = 0
    for x in range(len(actual)):
        if actual[x] == predicted[x]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Calculate a confusion matrix (rows are actual classes, columns are predicted classes)
def confusion_matrix(actual, predicted):
    unique = set(actual)
    matrix = [list() for x in range(len(unique))]
    for i in range(len(unique)):
        matrix[i] = [0 for x in range(len(unique))]
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for i in range(len(actual)):
        x = lookup[actual[i]]
        y = lookup[predicted[i]]
        matrix[x][y] += 1
    return unique, matrix

# Print a confusion matrix
def print_confusion_matrix(unique, matrix):
    print('Unique prediction values:')
    print('(P)' + ' '.join(str(x) for x in unique))
    print('(A)---')
    print("Confusion Matrix:")
    for i, x in enumerate(unique):
        print("%s| %s" % (x, ' '.join(str(x) for x in matrix[i])))

# Recall, precision and F1 score from a confusion matrix
# Note: the values returned are those of the last class processed in the loop
def recall_precision_calc(matrix):
    for i in range(len(matrix[0])):
        row_values = matrix[i]  # row i: actual class i
        col_values = [row[i] for row in matrix]  # column i: predicted class i
        tp = row_values[i]  # diagonal entry: actual i and predicted i
        fn = sum(row_values) - row_values[i]  # actual i, predicted as another class
        fp = sum(col_values) - col_values[i]  # predicted i, actually another class

        recall = tp / (tp + fn)
        precision = tp / (tp + fp)

        F1_score = 2 * (precision * recall) / (precision + recall)

    return recall, precision, F1_score


###### Accuracy methods for regression problems ######

# Calculate mean absolute error (MAE)
def mae_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        sum_error += abs(predicted[i] - actual[i])
    return sum_error / float(len(actual))

# Calculate root mean squared error (RMSE)
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)
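
A small sketch of how these classification metrics combine; the labels below are made up for illustration:

    from Performance_assessment import getAccuracy, confusion_matrix, print_confusion_matrix, recall_precision_calc

    actual    = [0, 0, 1, 1, 1, 0]
    predicted = [0, 1, 1, 1, 0, 0]
    print(getAccuracy(actual, predicted))  # 4 of the 6 labels match
    unique, matrix = confusion_matrix(actual, predicted)
    print_confusion_matrix(unique, matrix)
    recall, precision, f1 = recall_precision_calc(matrix)
    print(recall, precision, f1)  # metrics for the last class in the loop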

Sonar_case_study.py

Lines changed: 42 additions & 0 deletions
# Import, conversion and normalization of data
from Open_Conversion_Data import load_csv
from Open_Conversion_Data import str_column_to_float
from Open_Conversion_Data import str_column_to_int

# Algorithm evaluation harness
from Algorithm_test_harness import evaluate_algorithm_cv

# Performance metric
from Performance_assessment import getAccuracy

# Math/random functions
from math import sqrt
from random import seed

# Random forest model
from Tree_model_RF import random_forest

def main():
    # Test the random forest algorithm on the sonar dataset
    seed(2)
    # Load and prepare data
    filename = 'sonar-all-data.csv'
    dataset = load_csv(filename)
    # Convert string attributes to floats
    for i in range(0, len(dataset[0])-1):
        str_column_to_float(dataset, i)
    # Convert the class column to integers
    str_column_to_int(dataset, len(dataset[0])-1)
    # Evaluate the algorithm
    n_folds = 5
    max_depth = 10
    min_size = 1.0
    sample_size = 1.0
    n_features = int(sqrt(len(dataset[0])-1))
    for n_trees in [1, 5, 10]:
        scores = evaluate_algorithm_cv(dataset, random_forest, n_folds, getAccuracy,
                                       max_depth, min_size, sample_size, n_trees, n_features)
        print('Trees: %d' % n_trees)
        print('Scores: %s' % scores)
        print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))

main()
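
The case study expects sonar-all-data.csv to be the UCI sonar (mines vs. rocks) dataset: 208 rows, each with 60 numeric attributes followed by a class label 'M' or 'R' in the last column; str_column_to_int maps those labels to 0/1 before evaluation.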

Tree_model_RF.py

Lines changed: 136 additions & 0 deletions
from random import randrange


# Create a random subsample of the dataset with replacement
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

# Gini index as the cost function to minimize
def gini_index(groups, classes):
    # Count all samples at the split point
    n_instances = float(sum([len(group) for group in groups]))
    # Weighted sum of the per-group Gini indexes
    Gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0.0:
            continue
        score = 0.0
        # Score the group based on the proportion of each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # Weight the group score by its relative size
        Gini += (1.0 - score) * (size / n_instances)
    return Gini

# Split a dataset on an attribute index and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Best split point for a dataset, chosen from a random subset of attributes
# so that the trees of the forest do not all rely on the same features
def get_split_forest(dataset, n_attributes):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    attributes = list()
    while len(attributes) < n_attributes:
        index = randrange(len(dataset[0])-1)
        if index not in attributes:
            attributes.append(index)
    for index in attributes:
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            Gini = gini_index(groups, class_values)
            #print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], Gini))
            if Gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], Gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

# Create a terminal node value (the most common class in the group)
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Split a node or turn its children into terminal nodes, depending on the parameters
def split_node_forest(node, max_depth, min_size, n_attributes, depth):
    left, right = node['groups']
    # Delete the stored rows from the node, they are no longer needed
    del(node['groups'])
    # If either child is empty, create a single terminal node from the combined rows
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # If the maximum depth of the tree is reached, create terminal nodes
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # Process both children: terminate if the minimum size is reached, otherwise split further
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split_forest(left, n_attributes)
        split_node_forest(node['left'], max_depth, min_size, n_attributes, depth+1)
    # Right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split_forest(right, n_attributes)
        split_node_forest(node['right'], max_depth, min_size, n_attributes, depth+1)

# Build a decision tree
def build_tree(train, max_depth, n_attributes, min_size):
    root = get_split_forest(train, n_attributes)
    split_node_forest(root, max_depth, min_size, n_attributes, 1)
    return root

# Print a decision tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('%s[%s]' % ((depth*' ', node)))

# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Make a prediction with a list of bagged trees (majority vote)
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)


# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    trees = list()
    for _ in range(n_trees):
        sample = subsample(train, sample_size)
        # Argument order follows build_tree(train, max_depth, n_attributes, min_size)
        tree = build_tree(sample, max_depth, n_features, min_size)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions
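
A minimal sketch of the tree code in isolation, on a made-up two-attribute toy dataset (not part of this commit):

    from Tree_model_RF import build_tree, print_tree, predict

    toy = [[2.771, 1.784, 0], [1.728, 1.169, 0], [3.678, 2.812, 0],
           [7.497, 3.162, 1], [9.002, 3.339, 1], [7.444, 0.476, 1]]
    tree = build_tree(toy, max_depth=3, n_attributes=1, min_size=1)
    print_tree(tree)
    print(predict(tree, [6.642, 3.319, None]))  # prints one of the class values 0 or 1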
