Skip to content

Commit 39be764

Browse files
committed
Print percentage progress during parsing. Make the sentence separator configurable (default '.', previously a hard-coded newline). Add .gitignore.
1 parent e2eda8f commit 39be764

File tree

2 files changed

+10
-3
lines changed

2 files changed

+10
-3
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
*.pyc
2+
*.db
3+
*.txt
4+
.DS_Store

parse.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from __future__ import division
12
import sqlite3
23
import codecs
34
import sys
@@ -6,17 +7,19 @@ class Parser:
67
SENTENCE_START_SYMBOL = '^'
78
SENTENCE_END_SYMBOL = '$'
89

9-
def __init__(self, name, db):
10+
def __init__(self, name, db, split_char = '.'):
1011
self.name = name
1112
self.db = db
13+
self.split_char = split_char
1214

1315
def save_word_pair(self, word1, word2):
1416
self.db.add_word(word1, word2)
1517

1618
def parse(self, file_name):
1719
txt = codecs.open(file_name, 'r', 'utf-8').read()
18-
sentences = txt.split('\n')
20+
sentences = txt.split(self.split_char)
1921
i = 0
22+
l = len(sentences)
2023

2124
for sentence in sentences:
2225
words = sentence.split()
@@ -30,7 +33,7 @@ def parse(self, file_name):
3033
self.db.commit()
3134
i += 1
3235
if i % 1000 == 0:
33-
print i
36+
print '%d%% complete' % (100 * i / l,)
3437
sys.stdout.flush()
3538

3639

0 commit comments

Comments
 (0)