forked from zygmuntz/phraug
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathshuffle.py
85 lines (58 loc) · 1.48 KB
/
shuffle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
Shuffle lines in a [big] file
shuffle.py input_file.csv output_file.csv [<max. lines in memory>] [<random seed>]
"""
import sys
import random
# Defaults and reporting cadence used throughout the script.
_DEFAULT_LINES_IN_MEMORY = 25000
_PROGRESS_INTERVAL = 100000


def _count_lines(f):
    """Return the number of lines in the open file `f`.

    Prints the running count every `_PROGRESS_INTERVAL` lines.
    """
    count = 0
    for count, _ in enumerate(f, 1):
        if count % _PROGRESS_INTERVAL == 0:
            print(count)
    return count


def _load_lines(f, wanted):
    """Scan `f` from the start and return `{line_number: line}` for `wanted`.

    `wanted` is a set of zero-based line numbers. The scan stops as soon as
    every wanted line has been collected, so the tail of the file is not read
    when it is not needed.
    """
    found = {}
    f.seek(0)
    for number, line in enumerate(f):
        if number in wanted:
            found[number] = line
            if len(found) == len(wanted):
                break
        if (number + 1) % _PROGRESS_INTERVAL == 0:
            print(number + 1)
    return found


def shuffle_file(input_file, output_file,
                 lines_in_memory=_DEFAULT_LINES_IN_MEMORY):
    """Write the lines of `input_file` to `output_file` in random order.

    The input is read once to count its lines and then once per chunk of
    `lines_in_memory` lines, so at most that many lines are held in memory
    at a time. The order comes from the module-level `random` generator;
    seed it beforehand for a reproducible shuffle.

    Both files are opened in binary mode, so line contents are copied
    byte-for-byte. A final input line without a trailing newline gets one
    appended, otherwise it would be glued to whichever line follows it in
    the shuffled output.

    Raises:
        ValueError: if `lines_in_memory` is smaller than 1 (the chunking
            loop could never make progress).
    """
    if lines_in_memory < 1:
        raise ValueError("lines_in_memory must be at least 1")

    with open(input_file, 'rb') as f:
        # first count
        print("counting lines...")
        count = _count_lines(f)
        print(count)

        # then shuffle
        print("shuffling...")
        order = list(range(count))
        random.shuffle(order)

        with open(output_file, 'wb') as o_f:
            lines_saved = 0
            chunk_starts = range(0, count, lines_in_memory)
            for epoch, start in enumerate(chunk_starts, 1):
                # Slice by index instead of repeatedly re-copying the tail
                # of `order`.
                current_chunk = order[start:start + lines_in_memory]
                current_lines = _load_lines(f, set(current_chunk))

                print("writing...")
                for number in current_chunk:
                    line = current_lines[number]
                    if not line.endswith(b"\n"):
                        # Only the last input line can lack a terminator.
                        line += b"\n"
                    o_f.write(line)

                lines_saved += len(current_chunk)
                print("pass %s complete (%s lines saved)"
                      % (epoch, lines_saved))


def main(argv=None):
    """Command-line entry point.

    Usage: shuffle.py input_file.csv output_file.csv
           [<max. lines in memory>] [<random seed>]
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit("usage: shuffle.py input_file.csv output_file.csv "
                 "[<max. lines in memory>] [<random seed>]")

    input_file = argv[1]
    output_file = argv[2]

    try:
        lines_in_memory = int(argv[3])
    except IndexError:
        lines_in_memory = _DEFAULT_LINES_IN_MEMORY
    print("caching %s lines at a time..." % lines_in_memory)

    if len(argv) > 4:
        # The seed is passed through as the raw string, as before.
        random_seed = argv[4]
        random.seed(random_seed)
        print("random seed: %s" % random_seed)

    shuffle_file(input_file, output_file, lines_in_memory)


if __name__ == "__main__":
    main()