-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_out_email_text.py
46 lines (30 loc) · 1.12 KB
/
parse_out_email_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/python
from nltk.stem.snowball import SnowballStemmer
import string
def parseOutText(f):
f.seek(0) ### go back to beginning of file (annoying)
all_text = f.read()
### split off metadata
content = all_text.split("X-FileName:")
words = ""
if len(content) > 1:
### remove punctuation
text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
### project part 2: comment out the line below
#words = text_string
### split the text string into individual words, stem each word,
### and append the stemmed word to words (make sure there's a single
### space between each stemmed word)
words = ""
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
text_string_word = text_string.split()
for text in text_string_word:
words += stemmer.stem(text) + " "
return words
def main():
ff = open("../text_learning/test_email.txt", "r")
text = parseOutText(ff)
print text
if __name__ == '__main__':
main()