-
Notifications
You must be signed in to change notification settings - Fork 1
/
markov_model.py
639 lines (555 loc) · 25.5 KB
/
markov_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
# Standard library
import argparse
import logging
import os
import pickle
import random
import sys
import warnings
from bisect import bisect_left
from collections import Counter

# Third-party
import numpy as np
from numpy import linalg as la
from scipy.stats import rankdata
from scipy.stats import gaussian_kde
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
from hmmlearn import hmm
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
plt.style.use("ggplot")

# Local
from utils import sorted_eigs
# Route all log records (DEBUG and up) to stdout so they show in batch-job output.
# NOTE(review): the --quiet CLI flag is parsed below but never applied to this
# configuration -- logging stays at DEBUG regardless.
logging.basicConfig(stream=sys.stdout,level=logging.DEBUG)
def get_parser():
    '''Build and return the argument parser.

    Exposed as a function so an external application (as well as this
    script's __main__ block) can reuse the same CLI definition.
    '''
    parser = argparse.ArgumentParser(
        description="Generating diffusion maps of trajectories.")
    ################################################################################
    # General Simulation Parameters
    ################################################################################
    # Typed options share an identical registration pattern, so declare them
    # as data and register in one pass (order preserved for --help output).
    typed_options = [
        ("--seed", "seed", "Seed for Markov model.", int, 1),
        ("--n_clusters", "n_clusters", "n_clusters for Markov model.", int, 30),
        ("--method", "method",
         "method for Markov model. Can be hmm or agg_clustering.", str, "hmm"),
        ("--n_iter", "n_iter", "n_iter for Markov model.", int, 25),
        ("--covariance_type", "covariance_type",
         "covariance_type for Markov model. Can be spherical, diag, or full.",
         str, 'spherical'),
        ("--tol", "tol", "tol for Markov model.", float, 1e-2),
    ]
    for flag, dest, help_text, arg_type, default in typed_options:
        parser.add_argument(flag,
                            dest=dest,
                            help=help_text,
                            type=arg_type,
                            default=default)
    # Does the user want to quiet output?
    parser.add_argument("--quiet",
                        dest='quiet',
                        action="store_true",
                        help="Turn off logging (debug and info)",
                        default=False)
    parser.add_argument("--input_file_path",
                        dest='input_file_path',
                        type=str,
                        help="input file full path",
                        default=None)
    parser.add_argument("--output_file_path",
                        dest='output_file_path',
                        type=str,
                        help="Output file full path",
                        default=None)
    return parser
################################################################################
#### Supporting Functions
################################################################################
def make_markov_model(labels, n_clusters):
    '''
    Estimate a row-stochastic transition matrix from a cluster-label sequence.

    Counts consecutive transitions labels[i] -> labels[i+1], then normalizes
    each row by its total count.  Rows for clusters that are never left
    (zero total count) remain all zeros -- same result as the original
    np.power(-1) / nan_to_num path, but without the divide-by-zero
    RuntimeWarning and the intermediate inf values.

    Parameters
    ----------
    labels : sequence of int in [0, n_clusters)
        Cluster label of every sampled point, in trajectory order.
    n_clusters : int
        Number of clusters / matrix dimension.

    Returns
    -------
    ndarray, shape (n_clusters, n_clusters)
        T[i, j] is the estimated probability of moving from cluster i to j.
    '''
    T = np.zeros((n_clusters, n_clusters))
    for src, dst in zip(labels[:-1], labels[1:]):
        T[src, dst] += 1
    row_sums = T.sum(axis=1)
    ## Safe reciprocal: rows with zero counts keep reciprocal 0, so the
    ## corresponding rows of the result stay all-zero.
    inv = np.zeros_like(row_sums)
    np.divide(1., row_sums, out=inv, where=row_sums > 0)
    return (T.T * inv).T
def next_state(state, T_cum):
    '''Sample the successor of ``state`` by inverse-CDF lookup.

    ``T_cum`` holds the cumulative transition probabilities per row; a
    single uniform draw is located in the current state's CDF row.
    '''
    draw = np.random.uniform()
    cdf_row = np.asarray(T_cum)[state]
    return bisect_left(cdf_row, draw)
def run_markov_chain(start_cluster, T, steps = 10000, slow_down=1):
    '''
    Run a Markov chain for a single trajectory.

    Parameters
    ----------
    start_cluster : int
        Index of the starting cluster.  Must have at least one sample
        point (i.e. a nonzero row in T).
    T : array-like, shape (n, n)
        Transition matrix; rows are renormalized here, so unnormalized
        count matrices are accepted too.
    steps : int
        Number of transitions to simulate.
    slow_down : int
        Each visited state is repeated this many times in the output.

    Returns
    -------
    ndarray of int, length ``steps * slow_down``.

    Raises
    ------
    ValueError
        If start_cluster's row of T sums to zero (no sample points).
    '''
    T = np.asarray(T, dtype=float)
    n = T.shape[0]
    row_sums = T.sum(axis=1)
    T_cum = np.cumsum(T, axis=1)  ## per-row CDF over successor states
    ## The row sums can sometimes end up not being 1.
    ## Assuming they are >=0, we fix that here.
    for row in range(n):
        if row_sums[row] == 0.:  ## we can't divide by zero
            if start_cluster == row:
                raise ValueError("start_cluster has no sample points, so an "
                    "estimate cannot be made. Make sure to select a start_cluster "
                    "that has points in it.")
            ## Bug fix: the original assigned a *constant* row 1/n here, which
            ## is not a CDF -- bisect_left could then return the out-of-range
            ## index n.  Use the cumulative uniform distribution instead.
            T_cum[row] = np.cumsum(np.full(n, 1. / n))
        else:
            T_cum[row] /= row_sums[row]
    current = start_cluster
    outs = []
    for _ in range(steps):
        outs.extend([current] * slow_down)
        ## Inverse-CDF sampling of the successor (inlined next_state).
        current = bisect_left(T_cum[current], np.random.uniform())
    return np.asarray(outs)
def get_cluster_labels(X, n_clusters = 30, num_nearest_neighbors = 100):
    '''
    Cluster the rows of X with connectivity-constrained Ward agglomerative
    clustering and return the per-point cluster labels.

    Parameters
    ----------
    X : ndarray, shape (n_points, n_features)
    n_clusters : int
        Number of clusters to form.
    num_nearest_neighbors : int
        Neighborhood size for the kNN connectivity graph that constrains
        which points may be merged.

    Returns
    -------
    ndarray of int, shape (n_points,)
    '''
    ## Bug fix: AgglomerativeClustering was never imported at module level,
    ## so calling this function raised NameError.  Import locally so the
    ## function is self-contained.
    from sklearn.cluster import AgglomerativeClustering
    knn_graph = kneighbors_graph(X, num_nearest_neighbors, include_self=False)
    model = AgglomerativeClustering(linkage='ward',
                                    connectivity=knn_graph,
                                    n_clusters=n_clusters)
    model.fit(X)
    return model.labels_
def get_clusters(labels, n_clusters = 30):
    '''Invert a label sequence into per-cluster lists of point indices.

    Returns a list of length ``n_clusters``; entry ``c`` holds the indices
    of all points whose label is ``c`` (empty list if none).
    '''
    members = [[] for _ in range(n_clusters)]
    for point_index, cluster_index in enumerate(labels):
        members[cluster_index].append(point_index)
    return members
def get_hmm_hidden_states(X,
                          n_clusters = 30,
                          return_model = False,
                          n_iter=100,
                          tol=1e-2,
                          covariance_type = 'full',
                          random_state = 1,
                          verbose = False,
                          Ntraj = None):
    '''
    Fit a Gaussian HMM to X and return the decoded hidden-state labels.

    Parameters
    ----------
    X : ndarray, shape (points_total, n_features)
        Samples of all trajectories concatenated along axis 0.
    n_clusters : int
        Number of HMM hidden states.
    return_model : bool
        If True, also return the fitted GaussianHMM instance.
    n_iter, tol, covariance_type, random_state
        Passed through to hmmlearn's GaussianHMM.
    verbose : bool
        If True, print whether the EM monitor converged.
    Ntraj : int or None
        Number of equal-length trajectories concatenated in X (default 1).

    Returns
    -------
    hidden_states  or  (hidden_states, hmm_model) when return_model is True.

    Raises
    ------
    ValueError
        If X cannot be split into Ntraj equal-length trajectories.
    '''
    if Ntraj is None:
        Ntraj = 1
    ## Explicit raise instead of the original assert, which is silently
    ## stripped under ``python -O``.
    if X.shape[0] % Ntraj != 0:
        raise ValueError("X.shape[0] (%d) is not divisible by Ntraj (%d)."
                         % (X.shape[0], Ntraj))
    lengths = [X.shape[0] // Ntraj] * Ntraj
    hmm_model = hmm.GaussianHMM(n_components=n_clusters,
                                covariance_type=covariance_type,
                                n_iter = n_iter,
                                tol = tol,
                                random_state=random_state)
    hmm_model.fit(X, lengths)
    if verbose:
        print("converged", hmm_model.monitor_.converged)
    hidden_states = hmm_model.predict(X)
    if return_model:
        return hidden_states, hmm_model
    return hidden_states
def get_obs_sample_points(obs_indices, traj_expects):
    '''
    Observed quantities at sampled points.

    Selects the requested observable columns of ``traj_expects`` and
    returns them as rows: shape (len(obs_indices), n_points).
    '''
    selected_columns = traj_expects[:, list(obs_indices)]
    return np.asarray(selected_columns).T
def get_expect_in_clusters(obs_indices, clusters, n_clusters, obs_sample_points):
    '''
    Get the average expectation value in each cluster.

    Returns a dict mapping each observable index to a list of length
    ``n_clusters`` with that cluster's mean value, or None for clusters
    with no member points.
    '''
    expectations = {}
    for obs_index in obs_indices:
        per_cluster = [0.] * n_clusters
        for cluster_index, member_points in enumerate(clusters):
            if member_points:
                cluster_values = obs_sample_points[obs_index, member_points]
                per_cluster[cluster_index] = np.average(cluster_values)
            else:
                per_cluster[cluster_index] = None
        expectations[obs_index] = per_cluster
    return expectations
def get_cov_in_clusters(obs_indices, clusters, obs_sample_points):
    '''
    Get the covariance matrix of observables in each cluster.

    Returns a dict mapping cluster index to the covariance matrix of the
    selected observables over that cluster's points, or None for empty
    clusters.
    '''
    cov_by_cluster = {}
    for cluster_index, member_points in enumerate(clusters):
        if not member_points:
            cov_by_cluster[cluster_index] = None
            continue
        cluster_data = obs_sample_points[:, member_points][obs_indices, :]
        cov_by_cluster[cluster_index] = np.cov(cluster_data)
    return cov_by_cluster
def get_obs_generated(obs_indices,
                      T_matrix, ## Transition matrix used
                      expect_in_clusters,
                      start_cluster, ## index of starting cluster.
                      cov_each_cluster = None,
                      num_steps = 10000,
                      n_clusters = 10,
                      slow_down=1,
                      return_state_indices_only = False,
                      return_expectations_only = False):
    '''
    Generate a synthetic observable time series from the reduced Markov model.

    Runs a Markov chain over clusters, maps each visited cluster to its
    mean observable values, and (unless an early-return flag is set) adds
    Gaussian noise drawn from each cluster's covariance matrix.

    Returns, depending on flags:
    - return_state_indices_only: the visited cluster indices;
    - return_expectations_only: the per-step cluster means, shape
      (len(obs_indices), num_steps*slow_down);
    - otherwise: means plus per-cluster multivariate-normal noise
      (complex dtype -- see note below).

    Raises ValueError when noise is requested but cov_each_cluster is None.
    '''
    ## First find the clusters at each step using the run_markov_chain function
    steps = run_markov_chain(start_cluster, T_matrix, steps = num_steps, slow_down = slow_down)
    if return_state_indices_only:
        return steps
    ## when return_state_indices_only is False, we prepare first the expectation values at each cluster
    obs_generated = np.asarray([[expect_in_clusters[l][cluster] for cluster in steps ] for l in obs_indices])
    if return_expectations_only:
        return obs_generated
    ## get errors of expectation values from covariance matrices
    if cov_each_cluster is None:
        raise ValueError("get_obs_generated requires cov_each_cluster if not returning only indices or expectations.")
    ## Pre-draw, for each cluster, exactly as many zero-mean noise vectors
    ## as the chain visits that cluster.
    counts = dict(Counter(steps).items())
    mean = np.zeros(len(obs_indices))
    cov_each_cluster_in_traj = {}
    for k, v in counts.items():
        cov = cov_each_cluster[k]
        cov_each_cluster_in_traj[k] = np.random.multivariate_normal(mean, cov, v)
    indices_each_cluster = {k : 0 for k in counts} ## keeps track of how many entries we used from each cluster
    ## NOTE(review): the noise draws are real; dtype=complex presumably
    ## accommodates complex expectation values in obs_generated -- confirm.
    error_for_steps = np.zeros((len(obs_indices), num_steps*slow_down), dtype=complex)
    ## Consume the pre-drawn noise in visit order, one vector per time step.
    for i in range(error_for_steps.shape[1]):
        current_cluster_index = steps[i]
        error_for_steps[:,i] = cov_each_cluster_in_traj[current_cluster_index][indices_each_cluster[current_cluster_index]]
        indices_each_cluster[current_cluster_index] += 1
    return error_for_steps + obs_generated
def get_reduced_model_time_series(expect_in_clusters, indices, point_in_which_cluster,
                                  obs_indices=None):
    '''
    Return the average expectation value over clusters
    as a time series of the true trajectory.

    Parameters
    ----------
    expect_in_clusters : dict
        Maps observable index -> list of per-cluster mean values
        (as produced by get_expect_in_clusters).
    indices : sequence
        Sample-point indices; only its length is used (number of time steps).
    point_in_which_cluster : sequence of int
        Cluster membership of each point.
    obs_indices : iterable of int, optional
        Observables to include.  Defaults to all keys of expect_in_clusters.
        (Bug fix: the original referenced an undefined global ``obs_indices``
        and raised NameError on every call.)

    Returns
    -------
    list of lists, one inner list per observable.
    '''
    if obs_indices is None:
        obs_indices = list(expect_in_clusters)
    return [[expect_in_clusters[l][point_in_which_cluster[point]]
             for point in range(len(indices))]
            for l in obs_indices]
################################################################################
#### Plotting Functions
################################################################################
def contour_plot(Mat):
    '''Display |Mat| as a matrix image with a colorbar.'''
    figure = plt.figure()
    axis = figure.add_subplot(111)
    image = axis.matshow(abs(Mat), interpolation='nearest')
    figure.colorbar(image)
    plt.show()
def make_density_scatterplts(X, Y, label_shift = 0, observable_names = None, s=5):
    '''
    Grid of density-coloured scatter plots: each column of Y against each
    column of X, with X columns labelled "Phi<i+1+label_shift>".

    NOTE(review): the default observable_names list is sized by X.shape[-1]
    but indexed by Y's column index j -- this assumes Y has no more columns
    than X when the default is used; confirm callers.
    NOTE(review): plt.subplots returns a 2-D axes array only when both
    X.shape[-1] > 1 and Y.shape[-1] > 1; a single row/column would break the
    nested iteration below.
    '''
    if observable_names is None:
        observable_names = [str(j+1) for j in range(X.shape[-1])]
    fig, ax = plt.subplots(nrows=X.shape[-1],ncols=Y.shape[-1],figsize = (Y.shape[-1]*10,X.shape[-1]*10))
    for i,row in enumerate(ax):
        for j,col in enumerate(row):
            col.set_title( observable_names[j] +" vs Phi" + str(i+1+label_shift))
            x = X[:,i]
            y = Y[:,j]
            ## Colour points by the log of a Gaussian KDE density estimate.
            xy = np.vstack([x,y])
            z = gaussian_kde(xy)(xy)
            ## NOTE(review): edgecolor='' is rejected by newer matplotlib;
            ## use edgecolor='none' if upgrading.
            col.scatter(x, y, c=np.log(z), s=s, edgecolor='')
    plt.show()
def ellipses_plot(X, indices, hmm_model, n_clusters, std_dev = 1):
    '''
    Scatter two reduced coordinates coloured by log point density, with the
    +/- std_dev Gaussian ellipse of every HMM component overlaid.

    Parameters
    ----------
    X : ndarray, shape (n_points, n_features)
        Reduced coordinates.
    indices : sequence of two int
        Which two coordinate columns to plot.
    hmm_model : fitted GaussianHMM
        Provides per-state means_ and covars_.
    n_clusters : int
        Number of HMM states to draw ellipses for.
    std_dev : float
        Ellipse radius in standard deviations.
    '''
    # Calculate the point density
    ### from http://stackoverflow.com/questions/20105364/how-can-i-make-a-scatter-plot-colored-by-density-in-matplotlib
    x = X[:, indices[0]]
    y = X[:, indices[1]]
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    ## NOTE(review): covars_[clus, n, m] indexing assumes one full covariance
    ## matrix per state -- confirm the model used covariance_type='full'.
    covariances = np.asarray([[[hmm_model.covars_[clus, n, m]
                                for n in indices]
                               for m in indices]
                              for clus in range(n_clusters)])
    means = np.asarray([[hmm_model.means_[clus, n]
                         for n in indices]
                        for clus in range(n_clusters)])
    ## Principal axes of each covariance give the ellipse geometry.
    sorted_eig_lst = [sorted_eigs(*la.eig(cov)) for cov in covariances]
    angles = np.rad2deg(np.asarray([np.arctan2(*v[1][:, 0]) for v in sorted_eig_lst]))
    widths = [2 * std_dev * np.sqrt(v[0][0]) for v in sorted_eig_lst]
    heights = [2 * std_dev * np.sqrt(v[0][1]) for v in sorted_eig_lst]
    es = [Ellipse(xy=mean, width=width, height=height, angle=angle,
                  edgecolor='black', facecolor='none', linewidth=1)
          for mean, width, height, angle in zip(means, widths, heights, angles)]
    fig = plt.figure(0, figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax.set_title("coordinates " + str(indices[0]+1) + "," + str(indices[1]+1))
    for e in es:
        ax.add_artist(e)
        e.set_clip_box(ax.bbox)  ## was duplicated in the original; one call suffices
    ax.scatter(x, y, c=np.log(z), s=100, edgecolor='')
################################################################################
#### Object used to hold the model
################################################################################
class dim_red:
    '''
    Minimal container for dimensionality-reduction output.

    Stores the reduced coordinates ``X`` (one row per sample point) and the
    sampled expectation values, and provides scatter-plot helpers for
    inspecting the reduction against the observables.
    '''
    def __init__(self, X, Ntraj, expects_sampled, name, obs_indices=None):
        # Reduced coordinates (e.g. diffusion-map eigenvectors).
        self.X = X
        # Number of trajectories whose points are stacked together.
        self.Ntraj = Ntraj
        # Flatten per-trajectory blocks into one (points_total, observables) array.
        self.expects_sampled = np.concatenate(expects_sampled, axis = 0)
        if obs_indices is None:
            ## NOTE(review): reads .shape from the *pre-concatenation* argument,
            ## so this default assumes expects_sampled is an ndarray (e.g. 3-D
            ## of shape (Ntraj, points, observables)) -- confirm callers.
            self.obs_indices = range(expects_sampled.shape[-1])
        else:
            self.obs_indices = obs_indices
        self.name = name
    def plot_obs_v_diffusion(self,
                             observable_names = None,
                             s=5,
                             which_obs = None):
        '''Density scatter of each selected observable against each reduced coordinate.'''
        if which_obs is None:
            which_obs = range(self.expects_sampled.shape[-1])
        make_density_scatterplts(self.X,
                                 self.expects_sampled[:,which_obs],
                                 label_shift = 0,
                                 observable_names = observable_names,
                                 s=s)
    def plot_diffusion_v_diffusion(self,
                                   color='percentile',
                                   max_coord1=4,
                                   max_coord2=4,
                                   observable_names=None):
        '''
        Pairwise scatter plots of reduced coordinates Phi_k vs Phi_i.

        color='percentile' colours points by the rank of each observable
        (one figure per observable); color='density' colours by KDE
        log-density and produces a single figure (the loop breaks after
        the first observable).
        '''
        if observable_names is None:
            observable_names = [str(i) for i in self.obs_indices]
        for l in self.obs_indices:
            fig = plt.figure(figsize=(max_coord2*10,max_coord1*10))
            if color == 'percentile':
                # Percentile (rank) colouring of observable l.
                cols = rankdata(self.expects_sampled[:,l], "average")
            else:
                assert color == 'density'
            for k in range(max_coord1):
                for i in range(k+1,max_coord2):
                    x,y = self.X[:,k],self.X[:,i]
                    ax = fig.add_subplot(max_coord1, max_coord2, k*max_coord2+i+1)
                    if color == 'density':
                        xy = np.vstack([x,y])
                        z = gaussian_kde(xy)(xy)
                        cols = np.log(z)
                        plt.title("Log(density). Phi" + str(i+1) + " versus Phi" + str(k+1) )
                    else:
                        plt.title("Observable: " + observable_names[l] + "; Coordinates: Phi" + str(i+1) + " versus Phi" + str(k+1) )
                    plt.scatter(x,y, c = cols)
            plt.tight_layout()
            plt.show()
            if color == 'density': ## only one iteration
                break
    def plot_diffusion_3d(self,
                          color_by_percentile = True,
                          coords = [0,1,2]):
        '''
        3-D scatter of three reduced coordinates, one subplot per observable,
        coloured by the observable's percentile rank.

        NOTE(review): the subplot position uses the observable index l
        directly, so this assumes obs_indices == range(num_obs) -- confirm.
        NOTE(review): color_by_percentile is accepted but never read.
        '''
        if len(coords) != 3:
            raise ValueError("number of coordinates must be 3 for 3D plot.")
        num_obs = len(self.obs_indices)
        fig = plt.figure(figsize=(num_obs*10,10))
        for l in self.obs_indices:
            ax = fig.add_subplot(1, num_obs, l+1, projection='3d')
            expects_sampled_percentile = rankdata(self.expects_sampled[:,l], "average")
            ax.scatter(self.X[:,coords[0]],self.X[:,coords[1]],self.X[:,coords[2]],
                       c = expects_sampled_percentile)
        plt.show()
class markov_model_builder:
    '''
    Builds and stores a cluster-based Markov model (Gaussian HMM or
    connectivity-constrained agglomerative clustering) on top of a dim_red
    object, along with per-cluster observable statistics, and generates
    synthetic observable trajectories from the fitted model.
    '''
    def __init__(self, dim_red=None, name=None):
        '''Copy coordinates and observables out of a dim_red container.

        dim_red may be None (e.g. immediately before calling load()); in
        that case a warning is printed and no attributes are set.
        '''
        if dim_red is not None:
            try:
                self.X = dim_red.X
            except AttributeError:
                ## Narrowed from a bare ``except:`` -- only a missing
                ## attribute is expected here; anything else should surface.
                print('Warning: did not find reduced coordinates X in dim_red')
            self.Ntraj = dim_red.Ntraj
            self.expects_sampled = dim_red.expects_sampled
            self.obs_indices = dim_red.obs_indices
            if name is None:
                self.name = dim_red.name + "_markov_builder"
            else:
                self.name = name
            self.status = 'not attempted'
        else:
            print("Warning, dim_red initialized to None.")
    def load(self, file_path):
        '''Restore builder state previously written by save().'''
        ## with-statement closes the handle even if unpickling fails.
        with open(file_path, 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict)
    def save(self, file_path):
        '''Pickle the full instance state (protocol 2) to file_path.'''
        with open(file_path, 'wb') as f:
            pickle.dump(self.__dict__, f, 2)
    def build_model(self,
                    n_clusters = 10,
                    method = 'hmm',
                    n_iter=1000,
                    covariance_type = 'full',
                    tol=1e-2,
                    get_expects = True,
                    get_cov = True,
                    random_state = 1,
                    verbose = True,
                    which_coords = 'X',
                    coords_indices_to_use = None):
        '''
        Fit the clustering / HMM and derive the Markov transition matrix.

        method can be 'hmm' or 'agg_clustering'.
        which_coords can be 'X' for reduced coordinates (e.g. diffusion
        coords) or 'expects' for expectation values.

        Side effects: sets self.labels, self.clusters, self.T, self.status,
        and (when requested) self.obs_sample_points, self.expects_in_clusters
        and self.cov_each_cluster.  Raises ValueError for an unknown method.
        '''
        print("building model...")
        if method in ('hmm', 'agg_clustering'):
            self.method = method
        else:
            raise ValueError("Unknown method type. method can be 'hmm' or 'agg_clustering'. ")
        self.n_clusters = n_clusters
        self.random_state = random_state
        if which_coords == 'X':
            self.X_to_use = self.X
        else:
            assert which_coords == 'expects'
            self.X_to_use = self.expects_sampled
        if coords_indices_to_use is not None:
            self.X_to_use = self.X_to_use[:, coords_indices_to_use]
        if self.method == 'hmm':
            ## Bug fix: verbose was hard-coded True here; honour the caller's flag.
            self.labels, self.hmm_model = get_hmm_hidden_states(self.X_to_use,
                                                                self.n_clusters,
                                                                return_model=True,
                                                                n_iter=n_iter,
                                                                tol=tol,
                                                                covariance_type=covariance_type,
                                                                random_state=random_state,
                                                                verbose=verbose,
                                                                Ntraj=self.Ntraj)
        else:  ## 'agg_clustering' -- validated above, so no further branch needed
            self.labels = get_cluster_labels(self.X_to_use, self.n_clusters)
        self.clusters = get_clusters(self.labels, self.n_clusters)
        ## The empirical transition matrix is used rather than
        ## self.hmm_model.transmat_ (deliberate choice kept from the original).
        self.T = make_markov_model(self.labels, self.n_clusters)
        self.status = 'model built'
        if get_expects:
            print('get_expects is True')
            self.obs_sample_points = get_obs_sample_points(self.obs_indices, self.expects_sampled)
            self.expects_in_clusters = get_expect_in_clusters(self.obs_indices, self.clusters, self.n_clusters, self.obs_sample_points)
        if get_cov:
            print('get_cov is True')
            ## NOTE: relies on self.obs_sample_points set in the get_expects
            ## branch; get_cov=True therefore requires get_expects=True.
            self.cov_each_cluster = get_cov_in_clusters(self.obs_indices, self.clusters, self.obs_sample_points)
    def get_ordering_by_obs(self, obs_index = 0):
        '''
        Return cluster indices sorted by each cluster's mean value of
        observable obs_index.

        NOTE(review): clusters with identical means collide as dict keys and
        empty clusters would break np.average -- assumed not to occur.
        '''
        assert self.status == 'model built'
        obs_used = self.expects_sampled[:, obs_index]
        expects_in_clusters = [np.average([obs_used[i] for i in self.clusters[k]]) for k in range(self.n_clusters)]
        D = {num: i for num, i in zip(expects_in_clusters, range(self.n_clusters))}
        cluster_order = [D[key] for key in sorted(D.keys())]
        return cluster_order
    def plot_transition_matrix(self, obs_index = None,):
        '''
        Matrix plot of |T|.  With an integer obs_index the rows/columns are
        reordered by the clusters' mean value of that observable.
        '''
        assert self.status == 'model built'
        if obs_index is None:
            contour_plot(self.T)
        elif isinstance(obs_index, int):
            cluster_order = self.get_ordering_by_obs(obs_index)
            def order_indices(mat):
                ## The inner index i runs over columns of the result, so this
                ## reorders and transposes (kept from the original).
                return np.asmatrix([[mat[i, j] for i in cluster_order] for j in cluster_order])
            contour_plot(order_indices(self.T))
        else:
            raise ValueError("obs_index should be None or an integer (representing an observable).")
    def ellipses_plot(self, indices = [0, 1]):
        '''
        Works only for hmm, using the diffusion coordinates X.
        '''
        assert self.status == 'model built'
        ellipses_plot(self.X_to_use, indices, self.hmm_model, self.n_clusters)
    def generate_obs_traj(self,
                          steps=10000,
                          random_state=1,
                          start_cluster=None,
                          slow_down=1,
                          return_state_indices_only=False,
                          return_expectations_only=False,
                          obs_indices=None):
        '''
        Generate a synthetic observable trajectory from the built model.

        If start_cluster is None, uses the cluster containing sample point 0
        (the first point of the first training trajectory).  Raises
        ValueError if no such cluster can be found.
        '''
        assert self.status == 'model built'
        if obs_indices is None:
            obs_indices = self.obs_indices
        np.random.seed(random_state)
        ## not provided, use the cluster holding the first training point
        if start_cluster is None:
            for i, cluster in enumerate(self.clusters):
                if len(cluster) > 0 and cluster[0] == 0:
                    start_cluster = i
        ## if it's still None, the initial state wasn't found -- bail out
        if start_cluster is None:
            raise ValueError("start_cluster not provided, and starting cluster not found.")
        return get_obs_generated(obs_indices,
                                 self.T,  ## Transition matrix used
                                 self.expects_in_clusters,
                                 start_cluster=start_cluster,  ## index of starting cluster
                                 cov_each_cluster=self.cov_each_cluster,
                                 num_steps=steps,
                                 n_clusters=self.n_clusters,
                                 slow_down=slow_down,
                                 return_state_indices_only=return_state_indices_only,
                                 return_expectations_only=return_expectations_only)
if __name__ == "__main__":
    ## Parse CLI options (see get_parser for definitions/defaults).
    parser = get_parser()
    args = parser.parse_args()
    output_file_path = args.output_file_path
    input_file_path = args.input_file_path
    seed = args.seed
    n_clusters = args.n_clusters
    method = args.method
    n_iter = args.n_iter
    covariance_type = args.covariance_type
    tol = args.tol
    ## with-statement closes the input handle (the original leaked it).
    with open(input_file_path, 'rb') as pkl_file:
        data1 = pickle.load(pkl_file)
    Ntraj = len(data1['traj_list'])
    if len(data1['expects'].shape) == 2:
        ## data1['expects'].shape = points_total, observables
        ## reshape into Ntraj, points_per_traj, observables
        expects_sampled = data1['expects'].reshape(Ntraj,
                                                   data1['expects'].shape[0] // Ntraj,
                                                   data1['expects'].shape[-1])
    elif len(data1['expects'].shape) == 1:
        ## data1['expects'].shape = points_total * observables,
        ## This resulted from a bug in some datasets, which has been fixed.
        ## The trajectories should have been fixed, but include this here just in case.
        num_expects = data1['expects'].shape[0] // data1['times'].shape[0]
        expects_sampled = data1['expects'].reshape(Ntraj,
                                                   data1['expects'].shape[0] // (Ntraj * num_expects),
                                                   num_expects)
    else:
        raise ValueError("Unknown data expects shape.")
    ## data1['vals'] (eigenvalues) is not needed here; only the eigenvectors are used.
    vecs = data1['vecs']
    name = input_file_path
    dim_red_obj = dim_red(vecs, Ntraj, expects_sampled, name)
    mod = markov_model_builder(dim_red_obj)
    ## Sometimes hmmlearn raises a ValueError for a bad seed:
    ##   ValueError: startprob_ must sum to 1.0 (got nan)
    ## Retry with incremented seeds; the successful seed value is recorded
    ## in the markov_model_builder class (as random_state).
    tries = 0
    max_tries = 10
    while tries < max_tries:
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                mod.build_model(n_clusters=n_clusters,
                                method=method,
                                n_iter=n_iter,
                                covariance_type=covariance_type,
                                tol=tol,
                                get_expects=True,
                                random_state=seed,
                                verbose=True,
                                which_coords='X',
                                coords_indices_to_use=None)
            break  ## success, leave the while loop
        except ValueError:
            seed += 1
            tries += 1
    else:
        ## Loop exhausted without a break: surface the failure instead of
        ## silently saving an unbuilt model.
        logging.error("Markov model build failed after %d attempts; "
                      "saving builder with status %r.", max_tries, mod.status)
    mod.save(output_file_path)
# load_trajectory('diffusion_map_fb1b6ff0eb4b6940e9312f91b2b244d82e1dee2129d0e81e1ee6851b1d9d864a.pkl')
# markov_model_fb1b6ff0eb4b6940e9312f91b2b244d82e1dee2129d0e81e1ee6851b1d9d864a.pkl
# markov_model_fe1ff70dd598fc2a7203b18f6bd47c6e7410191c86afd86fd423f1f534321299
# markov_model_0fcfe35345366834103e42b8981e15a37acd84b938c78f1359332d02a0ba9398