# Multivariate Linear Regression Model
import numpy as np


class mvr(object):
    '''Multivariate linear regression.'''

    def __init__(self, number_coefficients):
        '''Constructor.
        :param number_coefficients: Number of regressors (in addition to the intercept).'''
        self.number_coefficients = number_coefficients
    def read_data(self, data_path):
        '''Read and parse a whitespace-separated data file for use in other methods.
        :param data_path: Path to the data file.
        :returns: data_array: NumPy array with the data.
        '''
        with open(data_path) as data:
            temp_list = []
            for line in data:
                line = line.split()  # skip blank lines
                if line:
                    line = [float(i) for i in line]
                    temp_list.append(line)
        data_array = np.array(temp_list)
        return data_array
    def number_observations(self, data_path):
        '''Count the number of data points (lines) in the file.
        :param data_path: Path to the data file.
        :returns: number_of_observations
        '''
        number_of_observations = 0  # initialize
        with open(data_path) as data:
            for line in data:
                number_of_observations += 1
        return number_of_observations
    def cost_fct(self, theta, X, y):
        '''Mean squared error cost: J(theta) = sum((X @ theta - y)^2) / (2m).'''
        return np.sum((np.dot(X, theta) - y) ** 2) / (2 * len(X))
    def deriv(self, theta, X, y):
        '''Gradient of the cost function with respect to theta.'''
        m = len(X)  # number of observations/rows
        deriv = []  # one partial derivative per feature/column of X
        for feature in range(0, X.shape[1]):
            deriv.append(np.sum((np.dot(X, theta) - y) * X[:, feature]) / m)
        return np.array(deriv)
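
    # Illustrative sketch, not part of the original interface: the loop in
    # deriv computes X.T @ (X @ theta - y) / m one column at a time, so the
    # whole gradient can also be obtained with a single matrix product.
    def deriv_vectorized(self, theta, X, y):
        '''Vectorized gradient; a sketch equivalent to deriv above.'''
        return np.dot(X.T, np.dot(X, theta) - y) / len(X)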
    def minimize(self, data_path, learning_rate=0.01, convergence_threshold=0.00001):
        '''Estimate the optimal parameters for MVR using gradient descent.
        :param data_path: Path to the raw data.
        :param learning_rate: Step size for each gradient descent iteration.
        :param convergence_threshold: Stop once the parameter update is smaller than this.
        :returns: theta: Vector containing the estimated coefficients.
        '''
        data_array = self.read_data(data_path)
        m = len(data_array)  # number of observations/rows
        # Design matrix: first column all 1's (intercept), then one column per feature.
        X = np.column_stack([np.ones(m), data_array[:, 0]])
        for feature in range(1, self.number_coefficients):  # append the remaining feature columns
            X = np.column_stack([X, data_array[:, feature]])
        y = data_array[:, self.number_coefficients]  # vector of the target variable
        theta = np.zeros(self.number_coefficients + 1)  # initialize parameter vector
        new_theta = theta - learning_rate * self.deriv(theta, X, y)
        # Iterate until the update falls below the convergence threshold.
        while np.sqrt(np.sum((theta - new_theta) ** 2)) > convergence_threshold:
            theta = new_theta
            new_theta = theta - learning_rate * self.deriv(theta, X, y)
        return theta
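
    # Illustrative sketch for cross-checking minimize: the same least-squares
    # problem min ||X @ theta - y||^2 has a closed-form solution, and
    # np.linalg.lstsq computes it directly. Gradient descent should converge
    # toward this answer.
    def fit_closed_form(self, data_path):
        '''Least-squares fit via np.linalg.lstsq; a sketch, not the original method.'''
        data_array = self.read_data(data_path)
        X = np.column_stack([np.ones(len(data_array)),
                             data_array[:, :self.number_coefficients]])
        y = data_array[:, self.number_coefficients]
        theta, *_ = np.linalg.lstsq(X, y, rcond=None)
        return theta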

# Testing
test = mvr(4)
# print(test.number_observations("toy_data_linear.py"))
a = test.read_data("toy_data_linear.py")
# print(len(a))
print(a)
# print(test.cost_fct("toy_data_linear.py", 1, 1))
print(test.minimize("toy_data_linear.py"))
# print(test.read_data("toy_data_linear.py")[2][1])
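
# Illustrative usage sketch with synthetic data (the file name
# synth_linear.txt and the coefficients below are made up for this example).
# It writes rows of "x1 x2 x3 x4 y" and checks that minimize recovers them.
rng = np.random.default_rng(0)
features = rng.uniform(-1, 1, size=(200, 4))
targets = 2.0 + features @ np.array([1.5, -3.0, 0.5, 4.0])  # intercept 2.0
np.savetxt("synth_linear.txt", np.column_stack([features, targets]))
model = mvr(4)
print(model.minimize("synth_linear.txt"))  # should approach [2.0, 1.5, -3.0, 0.5, 4.0]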