-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinear_regression.py
283 lines (229 loc) · 13.3 KB
/
linear_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
from networkx.algorithms.shortest_paths.weighted import all_pairs_dijkstra_path_length
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import wntr
import wntr_WSN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx_graph as ng
import sklearn
from sklearn.model_selection import train_test_split
def model_network_with_linreg(n: any) -> list:
"""
Model every node in the water network with an sklearn.linear_regression() model based on readings from
n closest sensors
Arguments: n - amount of closest sensors to read data from. Uses n of each, eg. if n=1 readings from the closest
flowmeter AND the closest pressure meter are used (int or str 'all')
Returns: list of dicts describing each node in the network {'node', 'regression', 'msq', 'r2'}
where: node - node name (str)
regression - sklearn regression object
msq - mean squared error (float)
r2 - R2 score (float)
sensors - list of strings, names of sensors used
"""
print("Modeling the network with linreg (linear_regression.py - model_network_with_linregression())")
# collect data
G = ng.create_graph() # get NetworkX graphs
amount_of_junctions = len(G.nodes()) # for printing progress
amount_of_pipes = len(G.edges()) # for printing progress
shortest_paths = list(all_pairs_dijkstra_path_length(G)) # get shortest distances between all node pairs
sim_results = wntr_WSN.get_sim_results() # no leak simulation results
all_pressure_readings, all_flowrate_readings = get_all_sensor_readings(sim_results) # extract readings for sensors we have
# preprocess X data
data_X = pd.concat([all_pressure_readings, all_flowrate_readings], axis=1, join='inner') # join readings into a single pandas DF
data_X = data_X.loc[:, ~data_X.columns.duplicated()] # remove duplicate columns
normalization_params = {'mean': data_X.mean(), 'std': data_X.std()} # collect normalization params
data_X = (data_X - normalization_params['mean']) / normalization_params['std'] # normalise X set
# for each node in graph create a separate sklearn linear_regression() object
linreg_models = []
for i, node in enumerate(G.nodes()):
print(f'\tModeling junction {node}: ({i+1} out of {amount_of_junctions})') # printing progress
data_X_scope = data_X # copy of data_X inside for scope
sensors = ng.get_closest_sensors(G=G, # retrieve sensors closest to node
central_node=node,
n=n, # nr of sensors
shortest_paths=shortest_paths)
sensors_names = [i[0] for i in sensors] # and flowrate readings, get rid of distances
data_X_scope = data_X_scope[sensors_names] # extract data for the closest sensors
data_y = sim_results.node['pressure'][node] # get y data for supervised learning
train_X, test_X, train_y, test_y = train_test_split(data_X_scope, # split and shuffle dataset
data_y,
test_size=0.1)
linreg_models.append(create_linreg_model(node, train_X, test_X, train_y, test_y)) # model the node and add it to the list
return linreg_models, normalization_params
def get_all_sensor_readings(sim_results: wntr.sim.results.SimulationResults)\
-> [pd.core.frame.DataFrame, pd.core.frame.DataFrame]:
"""
Retrieve flowrate and pressure readings of all sensors
throughout the whole simulation time
Arguments: WNTR simulation results object (wntr.sim.results.SimulationResults)
Returns: list of two pandas DataFrames [pressure_readings, flowrate_readings]
"""
# retrieve flowrate and pressure sensors names
sensors = ng.get_sensor_names()
pressure_sensors = sensors['pressures']
flowrate_sensors = sensors['flows']
# get sim results for all nodes
all_pressure_results = sim_results.node['pressure']
all_flowrate_results = sim_results.link['flowrate']
# leave only data we can measure by sensors
pressure_readings = all_pressure_results[pressure_sensors]
flowrate_readings = all_flowrate_results[flowrate_sensors]
return [pressure_readings, flowrate_readings]
def create_linreg_model(node, train_X, test_X, train_y, test_y) -> dict:
"""
Creates sklearn linear regression object for a given node
Arguments: node - node name (str)
train_X, test_X - (DF)
train_y, test_y - (series)
Returns: dict {'node', 'regression', 'msq', 'r2'}
where: node - node name (str)
regression - sklearn regression object
msq - mean squared error (float)
r2 - R2 score (float)
sensors - list of strings, names of sensors used
"""
# create model and fit the data
regression = linear_model.LinearRegression()
regression.fit(train_X, train_y)
# gather predictions on test set
test_pred = regression.predict(test_X)
# evaluate the model
msq = mean_squared_error(test_y, test_pred)
r2 = r2_score(test_y, test_pred)
return {'node': node, 'linreg': regression, 'msq': msq, 'r2': r2, 'sensors': train_X.columns.tolist()}
def predict_pressure_on_leaks(linreg_models: list, norm_params: dict, leak_start = 10, leak_end = 40, leak_area = 0.0001) -> pd.core.frame.DataFrame:
"""
For every modeled node in the network simulate hydraulics with it leaking
Arguments: linreg_models - list of dicts from model_network_with_linreg()
norm_params - dict of parameters, also from model_network_with_linreg()
leak_start - leak start time in hours (int)
leak_end - leak end time in hours (int)
leak_area - area of the leak (meters squared)
Returns: DataFrame where each node has its separate column with pressures predicted for a scenario of its leak, index is the time in seconds
"""
G = ng.create_graph()
leak_predictions = {}
temp = 0#TEMP simulate leak only for first 4 nodes, to save time in debugging---------------------------------------------------------
# for each node simulate its leak, and predict its pressure using its linear regression model
for node in G.nodes():
print(f'Simulating node {node} with leak')
temp += 1#TEMP simulate leak only for first 4 nodes, to save time in debugging---------------------------------------------------------
if temp>10:#TEMP simulate leak only for first 4 nodes, to save time in debugging---------------------------------------------------------
break#TEMP simulate leak only for first 4 nodes, to save time in debugging---------------------------------------------------------
try:
# simulate hydraulics with a leak on this particular node
sim_results_L = wntr_WSN.get_sim_results_LEAK(node=node, area=leak_area, start_time=leak_start, end_time=leak_end)
# find linear regression model for this particular node
for model in linreg_models:
if model['node'] == node:
linreg_dict = model
# join flowrate and pressure measurements into a single DataFrame
sim_results_L = pd.concat([sim_results_L.node['pressure'], sim_results_L.link['flowrate']], axis=1)
# choose only the available sensors
sim_results_L = sim_results_L[linreg_dict['sensors']]
# choose only leak timeframe (index values in seconds)
sim_results_L = sim_results_L.loc[[3600*i for i in range(leak_start, leak_end+1)]]
# normalise the input data with respective parameters
sim_results_L = (sim_results_L - norm_params['mean'].loc[linreg_dict['sensors']]) / norm_params['std'].loc[linreg_dict['sensors']]
# predict pressure for each node
leak_predictions[node] = linreg_dict['linreg'].predict(sim_results_L)
except Exception as e:
# sometimes an internal WNTR error happens, I didn't manage to find out why (function convergence error)
print(f'Runtime error on simulation of node {node} with leak (convergence failed)\nError message:\n{e}')
# create a DataFrame of results, index is time of leak in seconds
index = [3600*i for i in range(leak_start, leak_end+1)]
results = pd.DataFrame(data=leak_predictions, index=index)
return results
def find_residuals(pressure_predictions_L: pd.core.frame.DataFrame, leak_start: int, leak_end: int) -> pd.core.frame.Series:
"""
For every modeled node calculate residuum used for leak detection. Residuum is a diffrence between:
y - actual pressure values coming from a simulation WITHOUT leaks
y_hat - values predicted by linear regression models using data from simulations WITH leaks
The function calculates diffrences in leak timeframe and returns the minimal value as a residual
Arguments: pressure_predictions_L - y mentioned above, from predict_pressure_on_leaks()
leak_start - leak start time in hours (int)
leak_end - leak end time in hours (int)
Returns: pd.core.frame.Series - Series of residuals for each modeled node
"""
G = ng.create_graph()
# get simulation results for the network WITHOUT leaks
sim_results_nL = wntr_WSN.get_sim_results()
# select the leak timeframe (index values in seconds)
nodes_pressure_nL = sim_results_nL.node['pressure'].loc[[3600*i for i in range(leak_start, leak_end+1)]]
# select columns corresponding to nodes that have predictions ready (to subtract DataFrames cleanly)
nodes_pressure_nL = nodes_pressure_nL[pressure_predictions_L.columns.tolist()]
# get absolute diffrence, then choose the minima for each node (column)
#results = (nodes_pressure_nL - pressure_predictions_L).abs()
#print(results)
results = (nodes_pressure_nL - pressure_predictions_L).abs().min(axis=0)
return results
def check_network_for_leaks_v1(residuals: pd.core.frame.Series, sim_results_nL: wntr.sim.results.SimulationResults,
test_sim: wntr.sim.results.SimulationResults, linreg_models: dict,
norm_params: dict) -> pd.core.frame.DataFrame:
"""
Attempt at creating an algorithm for finding leaks. Long story short, calculates the diffrence:
simulation results with NO LEAK (MINUS) linreg predictions based on limited number of sensors in real-time
Arguments: residuals - residuums calculated in find_residuals() (not currently used due to algorithm not working well enough)
sim_results_nL - simulation results of the water network with a leak on a particular nodee
test_sim - simulation results of the case we want to predict leaks on
linreg_models - linear regression models for each node (created by model_network_with_linreg())
norm_params - normalization params for input data for linreg models (created by model_network_with_linreg())
Returns: results - DF of diffrences explained in the function comment header
"""
leak_predictions = {}
# join flowrate and pressure measurements into a single DataFrame
test_sim = pd.concat([test_sim.node['pressure'], test_sim.link['flowrate']], axis=1)
for node in list(residuals.index.values):
# find linear regression model for this particular node
for model in linreg_models:
if model['node'] == node:
linreg_dict = model
# choose only the available sensors
sim_results_to_check = test_sim[linreg_dict['sensors']]
# normalise the input data with respective parameters
sim_results_to_check = (sim_results_to_check - norm_params['mean'].loc[linreg_dict['sensors']]) / norm_params['std'].loc[linreg_dict['sensors']]
# predict pressure for each node
leak_predictions[node] = linreg_dict['linreg'].predict(sim_results_to_check)
index = [3600*i for i in range(len(test_sim.index))] # indexes for proper DF creation
results = pd.DataFrame(data=leak_predictions, index=index) # DF of linreg pressure predictions
#results = results[list(residuals.index)] # filter only predictions we have residuals for
sim_results_nL = sim_results_nL.node['pressure'][list(residuals.index)] # get simulation results of no leak
results = (sim_results_nL - results) # get diff of the two
results.plot()
plt.show()
return results
def check_network_for_leaks_v1_eval():
"""
Setup for evaluation of leak finding algorithm - check_network_for_leaks_v1()
Arguments: None
Returns: None
"""
# params for finding residuals between simulation with leaks and linreg predictions
leak_start = 1
leak_end = 30
leak_area = 0.0001
# model the whole network with linreg models
models, normalization_params = model_network_with_linreg(n='all')
# get pressure predictions for each node when there's a leak on this exact node
pressure_predicted_L = predict_pressure_on_leaks(linreg_models=models, norm_params=normalization_params,
leak_start=leak_start, leak_end=leak_end, leak_area=leak_area)
# calculate the residuals
residuals = find_residuals(pressure_predictions_L=pressure_predicted_L, leak_start=leak_start, leak_end=leak_end)
# create NetworkX graph of water network
G = ng.create_graph()
# get simulation results for the network WITHOUT leaks
sim_results_nL = wntr_WSN.get_sim_results()
# params for simulation of the node we'll try to detect a leak on
node = 'J4'
test_leak_start = 10
test_leak_end = 60
test_leak_area = 0.0001
# simulate leak on a node we'll try to detect a leak on
network_to_be_checked = wntr_WSN.get_sim_results_LEAK(node=node, area=test_leak_area, start_time=test_leak_start, end_time=test_leak_end)
# check said network for leaks, get graphs of v1 algorithm values over time
temp = check_network_for_leaks(residuals, sim_results_nL, sim_results_nL, models, normalization_params)
return
if __name__ == '__main__':
pass