Skip to content

Commit e2eda8f

Browse files
committed
init
0 parents  commit e2eda8f

File tree

4 files changed

+135
-0
lines changed

4 files changed

+135
-0
lines changed

db.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import sqlite3
2+
3+
class Db:
    """SQLite-backed store of word-pair counts.

    Each row of the ``words`` table holds ``(word, next_word, count)``:
    how many times ``next_word`` was recorded directly after ``word``.
    The database lives in the file ``<name>.db``.

    Instances can be used as context managers; the connection is closed
    on exit. Closing does not commit, so call ``commit()`` first to keep
    pending changes.
    """

    def __init__(self, name):
        """Open (creating if needed) ``<name>.db`` and ensure the schema exists."""
        self.name = name
        self.conn = sqlite3.connect(name + '.db')
        c = self.conn.cursor()
        c.execute('CREATE TABLE IF NOT EXISTS words (word, next_word, count)')
        c.execute('CREATE INDEX IF NOT EXISTS i_word ON words (word,next_word)')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the ``with`` body propagate.
        return False

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def get_word_pair_count(self, word, next_word):
        """Return the stored count for the pair, or 0 if it was never recorded."""
        c = self.conn.cursor()
        c.execute('select count from words where word=? and next_word=?', (word, next_word))
        row = c.fetchone()
        return row[0] if row else 0

    def add_word(self, word, next_word):
        """Record one more occurrence of ``next_word`` following ``word``.

        The change is not committed; call ``commit()`` to persist it.
        """
        c = self.conn.cursor()
        # Increment in place so the read and the write are one statement;
        # rowcount tells us whether a row for this pair already existed.
        c.execute('UPDATE words SET count = count + 1 WHERE word=? AND next_word=?',
                  (word, next_word))
        if c.rowcount == 0:
            c.execute('INSERT INTO words (word, next_word, count) VALUES (?,?,?)',
                      (word, next_word, 1))

    def commit(self):
        """Commit pending changes to the database file."""
        self.conn.commit()

    def get_word_count(self, word):
        """Return a dict mapping each recorded successor of ``word`` to its count.

        The dict is empty when ``word`` has no recorded successors.
        """
        c = self.conn.cursor()
        c.execute('SELECT next_word, count FROM words WHERE word=?', (word,))
        return dict(c.fetchall())

    def reset(self):
        """Delete every stored word pair and commit the deletion."""
        c = self.conn.cursor()
        c.execute('delete from words')
        self.conn.commit()

gen.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import sqlite3
2+
from parse import Parser
3+
from random import randint
4+
5+
class Generator:
    """Generate random sentences by walking stored word-pair counts.

    ``db`` must provide ``get_word_count(word)`` returning a dict that maps
    each successor of ``word`` to the number of times it was recorded.
    """

    def __init__(self, name, db):
        self.name = name
        self.db = db

    def get_next_word(self, word):
        """Pick a successor of ``word`` at random, weighted by its count.

        Raises:
            ValueError: if ``word`` has no recorded successors (for example
                when the database has not been populated yet).
        """
        candidate_words = self.db.get_word_count(word)
        total_next_words = sum(candidate_words.values())
        if total_next_words < 1:
            # randint(1, 0) would raise an opaque "empty range" ValueError.
            raise ValueError('no known next word for %r' % (word,))

        # Draw a position in 1..total and return the candidate whose
        # cumulative count first reaches it.
        target = randint(1, total_next_words)
        running_total = 0
        for candidate, count in candidate_words.items():
            running_total += count
            if target <= running_total:
                return candidate
        # Raised explicitly (rather than `assert False`) so it survives -O.
        raise AssertionError('cumulative counts never reached the drawn target')

    def make_sentence(self):
        """Return one generated sentence as a space-separated string.

        Starts from the sentence-start symbol and follows random successors
        until the sentence-end symbol is drawn.
        """
        word = self.get_next_word(Parser.SENTENCE_START_SYMBOL)
        sentence = []

        while word != Parser.SENTENCE_END_SYMBOL:
            sentence.append(word)
            word = self.get_next_word(word)

        return ' '.join(sentence)

    def generate(self, count):
        """Print ``count`` generated sentences, one per line."""
        for _ in range(count):
            # Parenthesised single-argument print works on Python 2 and 3.
            print(self.make_sentence())

markov.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from db import Db
2+
from gen import Generator
3+
from parse import Parser
4+
import sys
5+
6+
7+
def main(argv=None):
    """Run the command-line interface.

    Usage: ``<prog> parse <name> <path to txt file>`` records word pairs
    from the text file into the database ``<name>``;
    ``<prog> gen <name> <count>`` prints ``count`` generated sentences.

    Args:
        argv: full argument list including the program name; defaults to
            ``sys.argv``.

    Raises:
        ValueError: on a wrong number of arguments, an unknown mode, or a
            non-integer ``count``.
    """
    args = sys.argv if argv is None else argv
    prog = args[0] if args else 'markov.py'
    usage = 'Usage: %s (parse <name> <path to txt file>|gen <name> <count>)' % (prog, )

    if len(args) != 4:
        raise ValueError(usage)

    mode = args[1]
    name = args[2]
    # All arguments are validated before Db(name) is constructed, because
    # constructing it creates the database file on disk.
    if mode == 'parse':
        file_name = args[3]
        db = Db(name)
        Parser(name, db).parse(file_name)
    elif mode == 'gen':
        count = int(args[3])
        db = Db(name)
        Generator(name, db).generate(count)
    else:
        raise ValueError(usage)


if __name__ == '__main__':
    main()

parse.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import sqlite3
2+
import codecs
3+
import sys
4+
5+
class Parser:
    """Read a text file and record its consecutive word pairs in a database.

    Each line of the file is treated as one sentence. ``db`` must provide
    ``add_word(word, next_word)`` and ``commit()``.
    """

    # Pseudo-words marking the beginning and end of every sentence.
    SENTENCE_START_SYMBOL = '^'
    SENTENCE_END_SYMBOL = '$'

    def __init__(self, name, db):
        self.name = name
        self.db = db

    def save_word_pair(self, word1, word2):
        """Record that ``word2`` directly follows ``word1``."""
        self.db.add_word(word1, word2)

    def parse(self, file_name):
        """Record every word pair found in the UTF-8 text file ``file_name``.

        Lines are split on whitespace. Each non-blank line contributes the
        pairs start-symbol -> first word, every adjacent word pair, and
        last word -> end-symbol, and is committed once stored. Blank lines
        are skipped so that no start -> end pair (an empty sentence) is
        recorded. The running line count is printed every 1000 lines.
        """
        # The with-block closes the file even if reading fails.
        with codecs.open(file_name, 'r', 'utf-8') as source:
            txt = source.read()

        for line_number, sentence in enumerate(txt.split('\n'), 1):
            words = sentence.split()

            if words:
                prev_word = Parser.SENTENCE_START_SYMBOL
                for word in words:
                    self.save_word_pair(prev_word, word)
                    prev_word = word
                self.save_word_pair(prev_word, Parser.SENTENCE_END_SYMBOL)
                self.db.commit()

            if line_number % 1000 == 0:
                # Parenthesised single-argument print works on Python 2 and 3.
                print(line_number)
                sys.stdout.flush()
36+

0 commit comments

Comments
 (0)