# tweet_utils.py
from datetime import datetime
import string
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
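
# Note: the NLTK stopwords corpus must be available at runtime. If it is not
# already installed, download it once with:
#   import nltk; nltk.download('stopwords')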

# Gets the tweet time.
def get_time(tweet):
    return datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y")
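
# For example, get_time parses a created_at value of
# 'Mon Sep 24 03:35:21 +0000 2012' (the Twitter API's format)
# to datetime(2012, 9, 24, 3, 35, 21).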

# Gets all hashtags.
def get_hashtags(tweet):
    return [tag['text'] for tag in tweet['entities']['hashtags']]

# Gets the screen names of any user mentions.
def get_user_mentions(tweet):
    return [m['screen_name'] for m in tweet['entities']['user_mentions']]

# Gets the tweet's text with any urls removed, using the url entities' indices.
def remove_urls(tweet):
    text = tweet['text']
    slices = []
    if 'urls' in tweet['entities']:
        for url in tweet['entities']['urls']:
            slices += [{'start': url['indices'][0], 'stop': url['indices'][1]}]
    # Remove from the highest start index down so earlier indices stay valid.
    for s in sorted(slices, key=lambda x: -x['start']):
        text = text[:s['start']] + text[s['stop']:]
    return text

# Gets the text, sans links, mentions, media, and symbols.
# (Hashtag stripping is left commented out, so hashtags stay in the text.)
def get_text_cleaned(tweet):
    text = tweet['text']
    slices = []
    # Strip out the urls.
    if 'urls' in tweet['entities']:
        for url in tweet['entities']['urls']:
            slices += [{'start': url['indices'][0], 'stop': url['indices'][1]}]
    # Strip out the hashtags.
    # if 'hashtags' in tweet['entities']:
    #     for tag in tweet['entities']['hashtags']:
    #         slices += [{'start': tag['indices'][0], 'stop': tag['indices'][1]}]
    # Strip out the user mentions.
    if 'user_mentions' in tweet['entities']:
        for men in tweet['entities']['user_mentions']:
            slices += [{'start': men['indices'][0], 'stop': men['indices'][1]}]
    # Strip out the media.
    if 'media' in tweet['entities']:
        for med in tweet['entities']['media']:
            slices += [{'start': med['indices'][0], 'stop': med['indices'][1]}]
    # Strip out the symbols.
    if 'symbols' in tweet['entities']:
        for sym in tweet['entities']['symbols']:
            slices += [{'start': sym['indices'][0], 'stop': sym['indices'][1]}]
    # Sort the slices from highest start to lowest.
    slices = sorted(slices, key=lambda x: -x['start'])
    # No offsets needed, since we're removing from highest to lowest.
    for s in slices:
        text = text[:s['start']] + text[s['stop']:]
    return text
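
# For example, removing slices [20, 28] and [39, 57] from highest start first
# deletes [39, 57] without shifting the earlier [20, 28] span, so no offset
# bookkeeping is needed.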

# Sanitizes the text by removing front and end punctuation,
# making words lower case, and removing any empty strings.
def get_text_sanitized(tweet):
    return ' '.join([w.lower().strip().strip(string.punctuation).strip()
                     for w in get_text_cleaned(tweet).split()
                     if w.strip().strip(string.punctuation).strip()])

# Gets the text, cleans it, makes it lower case, removes stop words, stems
# the words, and splits it into a list of tokens.
def get_text_normalized(tweet):
    # Sanitize the text first.
    text = get_text_sanitized(tweet).split()
    # Remove the stop words (build the set once rather than per word).
    stop_words = set(stopwords.words('english'))
    text = [t for t in text if t not in stop_words]
    # Create the stemmer.
    stemmer = LancasterStemmer()
    # Stem the words.
    return [stemmer.stem(t) for t in text]
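
# A minimal smoke-test sketch. The tweet below is a hand-built dict in the
# shape of the Twitter API's JSON (the field values are illustrative, not
# real API output), and it assumes the NLTK stopwords corpus is installed.
if __name__ == '__main__':
    sample = {
        'created_at': 'Mon Sep 24 03:35:21 +0000 2012',
        'text': 'Testing tweets with @example and #nltk http://t.co/abc123',
        'entities': {
            'hashtags': [{'text': 'nltk', 'indices': [33, 38]}],
            'user_mentions': [{'screen_name': 'example', 'indices': [20, 28]}],
            'urls': [{'url': 'http://t.co/abc123', 'indices': [39, 57]}],
            'symbols': [],
        },
    }
    print(get_time(sample))           # 2012-09-24 03:35:21
    print(get_hashtags(sample))       # ['nltk']
    print(get_user_mentions(sample))  # ['example']
    print(get_text_cleaned(sample))   # 'Testing tweets with  and #nltk '
    print(get_text_normalized(sample))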