-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathloadcsv.py
60 lines (49 loc) · 1.97 KB
/
loadcsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import csv
import os
from decouple import config
from words.models import Word, Language, WordEntry
# Load a word-frequency CSV into the database as US English word entries.
#
# Reads ``wordlist.csv`` from the directory named by the ``CSV_DATA_DIR``
# setting (columns ``Word`` and ``Frequency``), sums the frequencies of
# repeated words, ranks the words by total frequency (rank 1 = most
# frequent), makes sure a ``Word`` row exists for each one, and -- only when
# the ``en_us`` language was created by this run -- bulk-inserts the
# ``WordEntry`` rows.
path = config('CSV_DATA_DIR')
os.chdir(path)

# Parse the whole file before touching the database, so a malformed row
# (missing column, non-integer frequency) aborts the run without leaving a
# freshly created language that has no entries.
# newline='' is what the csv module requires for files handed to its readers.
# NOTE(review): the file is decoded with the platform default encoding, as
# before -- confirm whether the data should be read as UTF-8 explicitly.
frequencies = {}
with open('wordlist.csv', newline='') as csvfile:
    print("Starting...")
    for row in csv.DictReader(csvfile):
        text = row['Word']
        frequencies[text] = frequencies.get(text, 0) + int(row['Frequency'])

# Highest total frequency first. sorted() is stable, so words with equal
# frequency keep the order in which they first appeared in the CSV.
ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)

# Look the language up by name only; display_name is applied just on
# creation. get_or_create() already saves the new row, so no extra save().
english, new = Language.objects.get_or_create(
    name='en_us',
    defaults={'display_name': 'US English'},
)
if new:
    print("Saving new language...")

# Map text -> Word once, so each membership test below is a dict lookup.
existing_words_objects = {word.text: word for word in Word.objects.all()}

# Each item is a dict with the keys: text, language, frequency, word, rank.
words = []
for rank, (text, frequency) in enumerate(ranked, start=1):
    if text in existing_words_objects:
        word_object = existing_words_objects[text]
    else:
        # Saved one at a time so every Word has a primary key before a
        # WordEntry refers to it.
        word_object = Word(text=text)
        word_object.save()
    words.append(dict(
        text=text,
        language=english,
        frequency=frequency,
        word=word_object,
        rank=rank,
    ))

for word in words:
    print(word['rank'], word['text'], word['frequency'])

if new:
    word_entries = []
    for word in words:
        word_entries.append(WordEntry(
            word=word['word'],
            language=english,
            frequency=word['frequency'],
            rank=word['rank'],
        ))
    WordEntry.objects.bulk_create(word_entries)
else:
    print("Updating word entries not yet implemented!")
    print("Language not modified!")

# NOTE(review): these three calls are assumed to run whether or not the
# language was newly created -- confirm against the original layout.
Word.calculate_all_bigram_weights()
english.create_bigram_entries()
english.calculate_total_word_occurrences()