Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ networkx>=2.2.0
matplotlib
seaborn
statsmodels
quadpy
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def setup_package():
install_requires=['numpy>=1.8.0', 'scipy>=0.13',
'pandas>=0.13.0', 'h5py>=2.2.0',
'seaborn>=0.7.0', 'networkx>=2.0.0',
'emcee>=2.0.0', 'statsmodels', 'numba'],
'emcee>=2.0.0', 'statsmodels', 'numba',
'quadpy'],
package_data={'': ['*.txt']},
license='BSD-3clause',
author_email='[email protected]',
Expand Down
2 changes: 1 addition & 1 deletion starvine/mvar/mv_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def matrixPairPlot(data_in, weights=None, corr_stat="kendalltau", **kwargs):
else:
data = data_in
upper_kde = kwargs.pop("kde", False)
pair_plot = sns.PairGrid(data, palette=["red"], size=kwargs.pop("size", 5))
pair_plot = sns.PairGrid(data, palette=["red"], height=kwargs.pop("size", 5))
# UPPER
if upper_kde:
pair_plot.map_upper(sns.kdeplot, cmap="Blues_d")
Expand Down
189 changes: 185 additions & 4 deletions starvine/vine/C_vine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,29 @@
from pandas import DataFrame
from scipy.optimize import minimize
import networkx as nx
from itertools import product
from starvine.bvcopula import pc_base as pc
import scipy.integrate as spi
import numpy as np
from six import iteritems
from starvine.vine.tree import Vtree

def splitcubes(K, d, bounds_list):
coords = [np.linspace(bounds_list[i][0] , bounds_list[i][1], num=K + 1) for i in range(d)]
grid = np.stack(np.meshgrid(*coords)).T

ks = list(range(1, K))
for slices in product(*([[slice(b,e) for b,e in zip([None] + ks, [k+1 for k in ks] + [None])]]*d)):
yield grid[slices]

def cubesets(K, d, bounds_list=()):
assert len(bounds_list) == d
assert len(bounds_list[0]) == 2
if (K & (K - 1)) or K < 2:
raise ValueError('K must be a positive power of 2. K: %s' % K)

return list([set(tuple(p.tolist()) for p in c.reshape(-1, d)) for c in splitcubes(K, d, bounds_list)])


class Cvine(BaseVine):
"""!
Expand Down Expand Up @@ -108,6 +126,160 @@ def buildDeepTrees(self, level=1):
elif level == self.nLevels - 1:
self.vine[level].evalH()

def _lnpdf(self, x, **kwargs):
vine_cond_x = []
vine_cond_x.append(self.vine[0].evalHData(x))
for lvl in range(1, self.nLevels):
vine_cond_x.append(self.vine[lvl].evalHData(vine_cond_x[lvl - 1]))
"""
sum_pdf = np.zeros(len(x))
total_n_copula = np.math.factorial(self.nLevels)
for u_id, v_id in self.vine[0].tree.edges(data=False):
x_lvl = x

rootID = self.vine[0].rootNodeID
if u_id is not rootID:
nonRootID = u_id
else:
nonRootID = v_id

u, v = np.asarray(x_lvl[nonRootID]), np.asarray(x_lvl[rootID])
# u, v = np.asarray(x_lvl[u_id]), np.asarray(x_lvl[v_id])

# tree_wgt = len(self.vine[0].tree.edges)
tree_wgt = total_n_copula
sum_pdf += \
self.vine[0].tree.adj[nonRootID][rootID]["pc"].copulaModel.pdf( \
u, v) / tree_wgt

for lvl in range(1, self.nLevels):
for u_id, v_id in self.vine[lvl].tree.edges(data=False):
x_lvl = vine_cond_x[lvl - 1]

rootID = self.vine[lvl].rootNodeID
if u_id is not rootID:
nonRootID = u_id
else:
nonRootID = v_id

# u, v = np.asarray(x_lvl[u_id]), np.asarray(x_lvl[v_id])
u, v = np.asarray(x_lvl[nonRootID]), np.asarray(x_lvl[rootID])

# tree_wgt = len(self.vine[lvl].tree.edges)
tree_wgt = total_n_copula
sum_pdf += \
self.vine[lvl].tree.adj[nonRootID][rootID]["pc"].copulaModel.pdf( \
u, v) / tree_wgt
return sum_pdf
"""
ln_pdf = np.zeros(len(x))
total_n_copula = np.math.factorial(self.nLevels)
for u_id, v_id in self.vine[0].tree.edges(data=False):
x_lvl = x

rootID = self.vine[0].rootNodeID
if u_id is not rootID:
nonRootID = u_id
else:
nonRootID = v_id

# u, v = np.asarray(x_lvl[u_id]), np.asarray(x_lvl[v_id])
u, v = np.asarray(x_lvl[nonRootID]), np.asarray(x_lvl[rootID])

ln_pdf += \
np.log(self.vine[0].tree.adj[nonRootID][rootID]["pc"].copulaModel.pdf( \
u, v))

for lvl in range(1, self.nLevels):
for u_id, v_id in self.vine[lvl].tree.edges(data=False):
x_lvl = vine_cond_x[lvl - 1]

rootID = self.vine[lvl].rootNodeID
if u_id is not rootID:
nonRootID = u_id
else:
nonRootID = v_id

# u, v = np.asarray(x_lvl[u_id]), np.asarray(x_lvl[v_id])
u, v = np.asarray(x_lvl[nonRootID]), np.asarray(x_lvl[rootID])

ln_pdf += \
np.log(self.vine[lvl].tree.adj[nonRootID][rootID]["pc"].copulaModel.pdf( \
u, v))
return ln_pdf

def _pdf(self, x, **kwargs):
assert isinstance(x, DataFrame)
return np.exp(self._lnpdf(x, **kwargs))

def _cdf(self, x, k_split=2, **kwargs):
"""
@brief CDF via numerical integration of PDF.
@param x pandas dataframe of locations at which to evaluate CDF.
@param k_split splits the [0,1]^D dimensional hypercube
over which the copula PDF is supported over into k_split chunks
in each dimension. Ex: a copula of D=3 with k_split=2
would have 8 equally sized cubical integration zones.
"""
# note: quadpy makes use of mypy
import quadpy
assert isinstance(x, DataFrame)
dim = len(x.columns)
scheme = quadpy.ncube.stroud_cn_5_9(dim)
cdf_vector = np.zeros(len(x))
points_vector = np.ones(len(x)) * 0.0
def _pdf(*x):
# x should always have even len
col_labels = x[int(len(x)/2):]
x_dframe = DataFrame()
for i, col_label in enumerate(col_labels):
x_dframe[col_label] = [x[i]]
pdf_x = self._pdf(x_dframe)
print(x[0:int(len(x)/2)], pdf_x)
return pdf_x

def f_pdf(x_np):
nps = x_np.shape
x_dframe = DataFrame()
for k in range(nps[0]):
x_dframe[x.columns[k]] = x_np[k, :]
pdf_x = self._pdf(x_dframe)
pdf_x[np.isnan(pdf_x)] = np.max(pdf_x)
return pdf_x

edge_tol = 1e-8
for i, x_i in x.iterrows():
ranges = np.zeros((len(x_i), 2)) + edge_tol
col_labels = []
j=0
for col_name, col_data in x_i.iteritems():
col_labels.append(col_name)
ranges[j, 1] = np.clip(col_data, edge_tol, 1. - 0.1 * edge_tol)
j += 1

sub_cubes = cubesets(k_split, dim, ranges)
for k, sub_cube in enumerate(sub_cubes):
sub_cube_coords = np.asarray(list(sub_cube))
sub_cube_ranges = np.array([np.min(sub_cube_coords, axis=0), np.max(sub_cube_coords, axis=0)]).T
# print("Integrating subcube %d out of %d" % (int(k), len(sub_cubes)))
cdf_vector[i] += scheme.integrate(f_pdf,
quadpy.ncube.ncube_points(
*[r for r in sub_cube_ranges])
)

# cdf_vector[i] = scheme.integrate(f_pdf,
# quadpy.ncube.ncube_points(
# *[r for r in ranges])
# )

#res = spi.nquad(_pdf, ranges,
# args=(col_labels),
# opts={'limit': 2, 'epsabs': 1e-2, 'epsrel': 1e-2,
# 'points': points_vector, 'limlst': 8})[0]
#print("Vine CDF abs err est at xi=%s: %0.5e" % (str(x_i), res[1]))
#cdf_vector[i] = res[0]
return cdf_vector


class Ctree(Vtree):
"""!
Expand Down Expand Up @@ -222,16 +394,25 @@ def _evalH(self):
rootID = self.rootNodeID
if u is not rootID:
nonRootID = u
nonRootData = data["pc"].UU
rootData = data["pc"].VV
else:
nonRootID = v
nonRootData = data["pc"].VV
rootData = data["pc"].UU
condData[(nonRootID, rootID)] = data["h-dist"](data["pc"].VV,
data["pc"].UU)
return condData

def evalHData(self, x):
assert isinstance(x, DataFrame)
condData = DataFrame()
for u, v, data in self.tree.edges(data=True):
rootID = self.rootNodeID
if u is not rootID:
nonRootID = u
else:
nonRootID = v
condData[(nonRootID, rootID)] = data["h-dist"](x[v],
x[u])
return condData

def _getEdgeCopulaParams(self, u, v):
"""!
@brief Get copula paramters of particular edge in tree.
Expand Down
13 changes: 13 additions & 0 deletions starvine/vine/base_vine.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def _validate_trial_copula(self, trial_copula):
for key, val in iteritems(trial_copula):
assert key in self._all_trial_copula
assert self._all_trial_copula[key] == val
return trial_copula

@property
def _all_trial_copula(self):
Expand All @@ -51,6 +52,18 @@ def loadVineStructure(self, vS):
"""
pass

def vinePdf(self, x, level=1, **kwargs):
return self._pdf(x)

def vineCdf(self, x):
return self._cdf(x)

def _pdf(self, x, level=1, **kwargs):
raise NotImplementedError

def _cdf(self, x, level=1, **kwargs):
raise NotImplementedError

def vineNLLH(self, vineParams=[None], **kwargs):
"""!
@brief Compute the vine negative log likelihood. Used for
Expand Down
3 changes: 2 additions & 1 deletion starvine/vine/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ def unrollNodes(l):
tree_num = 0
else:
# Determine tree level from number of nodes (#nodes are powers of 2)
tree_num = int(np.log(len(unrollNodes(n0))) / np.log(2)) - 1
# tree_num = int(np.log(len(unrollNodes(n0))) / np.log(2)) - 1
tree_num = len(np.unique(np.asarray(n0).flatten())) - 1
current_tree = vine[tree_num].tree
next_tree = vine[tree_num + 1].tree
edge_info = current_tree[n0][n1]
Expand Down
29 changes: 28 additions & 1 deletion tests/test_C_vine_construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from scipy.stats import norm, beta
import unittest
import os
from scipy.optimize import minimize
import numpy as np
import pandas as pd
pwd_ = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -71,10 +72,14 @@ def testCvineConstruct(self):
self.assertTrue(np.allclose(tst_rho_matrix - sample_rho_matrix, 0, atol=0.10))
self.assertTrue(np.allclose(tst_ktau_matrix - sample_ktau_matrix, 0, atol=0.10))

def opti_wrap(fun, x0, args, disp=0, **kwargs):
return minimize(fun, x0, args=args, method='Nelder-Mead',
tol=1e-12, options={'maxiter': 4000}).x

# fit marginal distributions to original data
marginal_dict = {}
for col_name in tstData.columns:
marginal_dict[col_name] = beta(*beta.fit(tstData[col_name]))
marginal_dict[col_name] = beta(*beta.fit(tstData[col_name], optimizer=opti_wrap))
# scale the samples
c_vine_scaled_samples_a = tstVine.scaleSamples(c_vine_samples, marginal_dict)
matrixPairPlot(c_vine_scaled_samples_a, savefig="vine_varaite_resampled_scaled_a.png")
Expand All @@ -89,6 +94,28 @@ def testCvineConstruct(self):
self.assertTrue(np.allclose(tst_rho_matrix - sample_scaled_rho_matrix_a, 0, atol=0.1))
self.assertTrue(np.allclose(tst_rho_matrix - sample_scaled_rho_matrix_b, 0, atol=0.1))

# check vine pdf values
tstX = pd.DataFrame()
tstX['1a'] = [0.01, 0.4, 0.5, 0.6, 0.99, 0.999]
tstX['2b'] = [0.01, 0.4, 0.5, 0.6, 0.99, 0.999]
tstX['3c'] = [0.01, 0.4, 0.5, 0.6, 0.99, 0.999]
tstX['4d'] = [0.01, 0.4, 0.5, 0.6, 0.99, 0.999]
tstX['5e'] = [0.01, 0.4, 0.5, 0.6, 0.99, 0.999]
pdf_at_tstX = tstVine.vinePdf(tstX)
print("PDF at: ", tstX)
print(pdf_at_tstX)
self.assertTrue(np.all(pdf_at_tstX > 0.0))

# check vine cdf values
cdf_at_tstX = tstVine.vineCdf(tstX)
Copy link
Owner Author

@wgurecky wgurecky Oct 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same with CDF?

print("CDF at: ", tstX)
print(cdf_at_tstX)
cdf_tol = 0.05
self.assertTrue(cdf_at_tstX[1] > cdf_at_tstX[0])
self.assertTrue(np.all(0.0 <= cdf_at_tstX))
self.assertTrue(np.all(cdf_at_tstX <= 1.0 + cdf_tol))
self.assertAlmostEqual(cdf_at_tstX[-1], 1.0, delta=cdf_tol)


if __name__ == "__main__":
unittest.main()