Skip to content

Commit

Permalink
updates documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
BenjaminDoran committed Mar 29, 2018
1 parent 53d4675 commit bc02e79
Showing 1 changed file with 38 additions and 26 deletions.
64 changes: 38 additions & 26 deletions unidip/unidip.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,45 +12,52 @@

class UniDip:
""" Class containing the UniDip clustering algorithm.
Isolates peaks in high noise samples.
Isolates peaks in high noise 1d numeric samples.
INPUT:
dat: 1d np.array of floats or ints (required)
is_hist: False, flips from looking at
x axis to density along x axis
alpha: 0.05, tuning parameter, sets significance
level of p_values
ntrials: 100, number of trials when running diptest
mrg_dst: 1, distance to merge returned intervals
debug: False, determines whether to plot
the data at each recursion level
Algorithm Ref:
title: "Skinny-dip: Clustering in a sea of noise"
author: Samuel Maurus & Claudia Plant.
url: http://www.kdd.org/kdd2016/subtopic/view/skinny-dip-clustering-in-a-sea-of-noise
"""
def __init__(self, dat, is_hist=False, alpha=0.05, ntrials=100, merge_distance=1, debug=False):
def __init__(self, dat, is_hist=False, alpha=0.05, ntrials=100, mrg_dst=1, debug=False):
self.dat = np.msort(np.array(dat)) if not is_hist else np.array(dat)
self.is_hist = is_hist
self.alpha = alpha
self.ntrials = ntrials
self.merge_distance = merge_distance
self.mrg_dst = mrg_dst
self.debug = debug

def run(self):
""" Perform unidip algorithm on 1d array
INPUT:
dat: 1d np.array of floats or ints
offset: int, offset from dat[0]
is_hist: bool, flips from looking at x axis to density along x axis
alpha: float, tuning parameter, sets significance level of p_values
_is_model: internal should not be changed
numt: int, number of trials in diptest
plotdat: none or dat, determines whether to plot the data at each recursion level
RETURNS:
list of tuples: each tuple containing the start and end indices on the x axis.
"""
modidxs = self._unidip(0, len(self.dat), True, self.debug)
return self.merge_intervals(modidxs)

def merge_intervals(self, idxs):
""" merge intervals that are touching """
""" merge intervals that are less than merge distance apart """
midxs = []
for idx in sorted(idxs):
if not midxs:
midxs.append(idx)
else:
lower = midxs[-1]
# adjacent or overlapping (adjust MERGE_DISTANCE to merge intervals)
# adjacent or overlapping (adjust mrg_dst to merge intervals)
# that are close enough
if idx[0] - lower[1] <= self.merge_distance:
if idx[0] - lower[1] <= self.mrg_dst:
midxs[-1] = (lower[0], idx[1])
else:
midxs.append(idx)
Expand All @@ -59,7 +66,7 @@ def merge_intervals(self, idxs):
def plot(self, sub, ints, plot_style="seaborn"):
""" Plot complete data, highlight subset currently being searched,
and add vertical lines for discovered intervals. (only intervals of
the current level appear.)
the current recursion level appear.)
"""
import matplotlib.pyplot as plt
plt.style.use(plot_style)
Expand Down Expand Up @@ -88,16 +95,17 @@ def _unidip(self, start, end, is_model, debug):
""" Perform unidip algorithm on 1d array
INPUT:
dat: 1d np.array of floats or ints
offset: int, offset from dat[0]
is_hist: bool, flips from looking at x axis to density along x axis
alpha: float, tuning parameter, sets significance level of p_values
_is_model: internal should not be changed
numt: int, number of trials in diptest
plotdat: none or dat, determines whether to plot the data at each recursion level
start: 0, idx of first point in current slice
end: len(dat), idx one past the last point in the
current slice (exclusive slice bound)
is_model: True, always starts as true
ntrials: 100, number of trials in diptest
debug: False, determines whether to plot the
data at each recursion level
RETURNS:
list of tuples: each tuple containing the start and end indices on the x axis.
list of tuples: each tuple containing the start and
end indices on the x axis.
"""
dat = self.dat[start:end]
interval_idxs = list()
Expand Down Expand Up @@ -168,8 +176,12 @@ def _get_full_interval(self, mod_int):
return tuple(full_indxs)

def _mirror_data(self, dat, left):
""" Mirror dataset.
input: [1, 2, 3] output: [-2, -1, 0, 1, 2]
""" Mirrors dataset
if is_hist (measuring density)
on input = [1, 2, 3], output = [3, 2, 1, 2, 3]
else (measuring raw sample)
on input = [1, 2, 3], output = [-2, -1, 0, 1, 2]
"""
wdat = np.array(dat)
if self.is_hist:
Expand Down

0 comments on commit bc02e79

Please sign in to comment.