# processWords.py
# forked from K10ForTheWin/Text_Analysis_Technobabble
import os
import string

from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt

## Only needed if a shaped mask is enabled in makeImage():
## import numpy as np
## from PIL import Image

my_path = os.path.abspath(os.path.dirname(__file__))
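
# NLTK's stopword corpus must be downloaded once before first use,
# e.g. from an interactive session:
#   import nltk
#   nltk.download('stopwords')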
def get_all_stopwords(character_names=True):
    # Collect stopwords from every file in resources/.  The character-name
    # list (char_stopwords.txt) is skipped when character_names is False.
    # A name other than `stopwords` avoids shadowing the nltk import.
    stop_list = []
    resources_dir = os.path.join(my_path, "resources")
    for file in os.listdir(resources_dir):
        if file == "char_stopwords.txt" and not character_names:
            continue
        with open(os.path.join(resources_dir, file)) as infile:
            stop_list.extend(line.strip() for line in infile)
    return list(set(stop_list))
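
# e.g. get_all_stopwords(character_names=False) returns the same list minus
# the character names listed in resources/char_stopwords.txt.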
stopwords_mine = get_all_stopwords()
def add_nltk_stopwords_to_set(stopwords_mine):
    # Merge the project stopword lists with NLTK's built-in English set;
    # a set makes the membership tests in remove_stopwords() O(1).
    return set(stopwords_mine) | set(stopwords.words('english'))
stopwords_all_inclusive = add_nltk_stopwords_to_set(stopwords_mine)
# The per-character transcript files live here; chdir so that the later
# open("PICARD.txt") resolves against this directory.
root_keep = os.path.join(my_path, "data_char_lines_top_100")
os.chdir(root_keep)
def process_files():
    # Clean, de-stopword, and count the words in every transcript file.
    for file in os.listdir(root_keep):
        words = []
        with open(os.path.join(root_keep, file)) as infile:
            for line in infile:
                words.extend(line.split(" "))
        words = filter_words(words)
        words = remove_stopwords(words)
        words_dict = count_words(words)
##        print(len(words_dict.keys()))
##        for k, v in words_dict.items():
##            if v > 10:
##                print(k, v)
def filter_words(wordlist):
    # Lowercase every token and strip punctuation from any token that is
    # not purely alphabetic.
    filteredwords = []
    for w in wordlist:
        w = w.lower()
        if not w.isalpha():
            w = ''.join(char for char in w if char not in string.punctuation)
        filteredwords.append(w)
    return filteredwords
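
# e.g. filter_words(["Engage!", "Number"]) -> ["engage", "number"]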
def remove_stopwords(words):
    # Drop stopwords, counting how many were removed.
    stopwords_count = 0
    filteredwords = []
    for w in words:
        if w not in stopwords_all_inclusive:
            filteredwords.append(w)
        else:
            stopwords_count += 1
    print("stopwords removed:", stopwords_count)
    return filteredwords
def count_words(words):
    # Build a word-frequency dictionary in a single pass.
    d = {}
    for w in words:
        d[w] = d.get(w, 0) + 1
    return d
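
# The standard library offers the same thing as a one-liner, if preferred:
#   from collections import Counter
#   def count_words(words):
#       return dict(Counter(words))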
words = []
with open("PICARD.txt") as infile:
    for line in infile:
        words.extend(line.split(" "))
fname, ext = "PICARD.txt".split(".")
print("now processing", fname)
words = filter_words(words)
words = remove_stopwords(words)
w_dict = count_words(words)
sd = sorted(w_dict.items(), key=lambda x: x[1], reverse=True)
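
# e.g. the ten most frequent words:
#   for word, count in sd[:10]:
#       print(word, count)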
def makeImage(w_dict, fname):
##    alice_mask = np.array(Image.open("alice_mask.png"))
    wc = WordCloud(background_color="white", max_words=1000)  # mask=alice_mask
    # generate the word cloud from the frequency dictionary
    wc.generate_from_frequencies(w_dict)
    # save before showing: plt.show() closes the figure, so a savefig()
    # placed after it writes out a blank image
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.savefig('{}.png'.format(fname), bbox_inches='tight')
    plt.show()

##for sdd in sd:
##    print(sdd)

makeImage(w_dict, fname)
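
# WordCloud can also write the image directly, skipping matplotlib:
#   wc.to_file('{}.png'.format(fname))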