import random as core_random


def shuffle(x, random=None):
    """ Return a new list with the values of x in random order

    Pure version of standard ``random.shuffle``: ``x`` is never modified,
    and any iterable (list, tuple, generator, ...) is accepted.

    x: iterable of values to reorder
    random: optional zero-argument callable returning a float in [0, 1);
        when omitted the standard library's default generator is used

    Returns a freshly allocated list.

    >>> shuffle((1, 2, 3))  # doctest: +SKIP
    [2, 3, 1]
    """
    # ``list(x)`` copies lists and materialises every other iterable, so
    # the caller's object is left untouched.  (``list.copy`` does not
    # exist on Python 2 and would be redundant here anyway.)
    result = list(x)

    if random is None:
        core_random.shuffle(result)
        return result

    # ``random.shuffle`` dropped its ``random`` argument in Python 3.11, so
    # run the Fisher-Yates pass directly.  This is the same loop the
    # standard library used when a custom ``random`` was supplied.
    for i in range(len(result) - 1, 0, -1):
        j = int(random() * (i + 1))
        result[i], result[j] = result[j], result[i]
    return result


def test_shuffle():
    original = (1, 2, 3)
    assert set(shuffle(original)) == {1, 2, 3}
    assert original == (1, 2, 3)

    # Lists must be copied, not shuffled in place.
    data = [1, 2, 3]
    assert shuffle(data) is not data
    assert data == [1, 2, 3]
class Reservoir(object):
    """ Basic object for Reservoir Sampling

    Keeps a uniform random sample of at most ``size`` items from a stream
    of unknown length, using memory proportional to ``size`` only.

    >>> res = Reservoir(3)  # Reservoir of size 3
    >>> for item in range(10):
    ...     res.add(item)

    Res contains three elements randomly chosen from ``range(10)``

    >>> list(res)  # doctest: +SKIP
    [8, 3, 5]

    size: maximum number of items retained
    random: zero-argument callable returning a float in [0, 1)
    storage: list holding the current sample
    count: number of items offered to ``add`` so far
    """
    __slots__ = 'size', 'random', 'storage', 'count'

    def __init__(self, size, random=core_random.random):
        self.size = size
        self.random = random
        # A list rather than a set: repeated values in the stream must
        # each occupy a slot, items need not be hashable, and the evicted
        # slot can be chosen uniformly (``set.pop`` removes an arbitrary,
        # not a random, element).
        self.storage = []
        self.count = 0

    def add(self, item):
        """ Offer ``item`` to the reservoir

        The first ``size`` items are always kept.  After that the n-th
        item replaces a uniformly chosen stored item with probability
        ``size / n``.
        """
        self.count += 1
        if self.count <= self.size:
            self.storage.append(item)
            return

        # One draw picks a slot uniformly in [0, count); the item is kept
        # only when that slot falls inside the reservoir, which both
        # accepts with probability size / count and selects the victim.
        slot = int(self.random() * self.count)
        if slot < self.size:
            self.storage[slot] = item

    def __iter__(self):
        """ Iterate over the items currently held in the sample """
        return iter(self.storage)


def test_Reservoir():
    r = Reservoir(2)
    r.add(1)
    assert r.count == 1
    assert r.size == 2
    assert set(r) == {1}

    r.add(2)
    assert r.count == 2
    assert r.size == 2
    assert set(r) == {1, 2}

    r.add(3)
    assert r.count == 3
    assert r.size == 2
    assert tuple(sorted(r)) in ((1, 2), (1, 3), (2, 3))