-
Notifications
You must be signed in to change notification settings - Fork 0
/
helper_prabowo_ml.py
83 lines (70 loc) · 2.46 KB
/
helper_prabowo_ml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# =========================TEXT PREPROCESSING===========================
# from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# extract hashtags (returns the tag words without the leading '#')
def hashtags(text):
    """Return the list of hashtag words found in ``text``.

    A hashtag is a '#' immediately followed by one or more word characters;
    the returned strings do not include the leading '#'. The input text is
    not modified.
    """
    return [match.group(1) for match in re.finditer(r"#(\w+)", text)]
# translate emoji
# def emoji(text):
# for emot in UNICODE_EMOJI:
# if text == None:
# text = text
# else:
# text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",", "").replace(":", "").split()))
# return text
# remove retweet username and tweeted at @username
def remove_users(tweet):
    """Strip @user mentions from ``tweet`` and return the result.

    A mention is an '@' followed by one or more ASCII letters and then at
    least one more letter, digit, '-' or '_'. Only the mention itself is
    removed; the surrounding text is left as is.
    """
    mention = re.compile('(@[A-Za-z]+[A-Za-z0-9-_]+)')
    return mention.sub('', tweet)
# remove links
def remove_links(tweet):
    """Remove web links and ``[link]`` placeholders from ``tweet``.

    Removes, in order:
      * anything starting with ``http`` up to the next whitespace,
      * ``bit.ly/...`` short links up to the next whitespace,
      * every literal ``[link]`` placeholder, wherever it occurs.

    Returns the cleaned string; surrounding whitespace is left untouched.
    """
    tweet = re.sub(r'http\S+', '', tweet)  # remove http/https links
    # Escape the dot so that only a literal "bit.ly/" prefix is matched.
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links
    # str.strip('[link]') would treat its argument as a *set* of characters and
    # eat leading/trailing '[', 'l', 'i', 'n', 'k', ']' from ordinary words,
    # while leaving a "[link]" in the middle of the text alone. Replace the
    # literal placeholder instead.
    tweet = tweet.replace('[link]', '')
    return tweet
def clean_html(text):
    """Return ``text`` with every ``<...>`` tag-like span removed.

    The match is non-greedy and does not cross line breaks, so each span
    runs from a '<' to the nearest following '>' on the same line.
    """
    return re.sub('<.*?>', r'', text)
# remove non ascii character
def non_ascii(s):
    """Return ``s`` with every character whose code point is 128 or above dropped."""
    kept = []
    for char in s:
        if ord(char) < 128:
            kept.append(char)
    return "".join(kept)
def lower(text):
    """Return the lower-cased form of ``text`` (delegates to ``text.lower()``)."""
    lowered = text.lower()
    return lowered
# remove stopwords
def removeStopWords(str):
    """Drop stop words from a whitespace-separated string.

    The stop list is NLTK's English stop words plus a fixed set of custom
    words. Matching is exact and case-sensitive. The input is split on
    whitespace and the surviving words are re-joined with single spaces.

    NOTE: the stop-word set is rebuilt on every call.
    """
    # Project-specific additions to the NLTK list.
    custom_words = ('and','I','A','http','And','So','arnt','This','When','It','many','Many','so','cant','Yes','yes','No','no','These','these','mailto','regards','ayanna','like','email')
    blocked = set(stopwords.words("english"))
    blocked.update(custom_words)

    kept = []
    for token in str.split():
        if token not in blocked:
            kept.append(token)
    return ' '.join(kept)
# remove email address
def email_address(text):
    """Return ``text`` with e-mail-like tokens removed.

    A token is removed when it has one or more word characters, dots or
    hyphens on each side of an '@'.
    """
    return re.sub(r'[\w\.-]+@[\w\.-]+', r'', text)
def punct(text):
    """Strip punctuation by keeping only the word tokens of ``text``.

    The text is tokenized with NLTK's ``RegexpTokenizer`` using a
    word-character pattern, and the tokens are joined with single spaces.
    """
    # Tokens are maximal runs of word characters; everything else is dropped.
    word_tokenizer = RegexpTokenizer(r'\w+')
    return " ".join(word_tokenizer.tokenize(text))
# remove digits and special characters
def remove_digits(text):
    """Remove digits and special characters from ``text``.

    Only ASCII letters, whitespace and the punctuation marks
    . , ! ? / : ; plus double and single quotes are kept; every other
    character (digits included) is deleted.
    """
    # A leading ^ inside [] negates the class: match anything NOT listed.
    # The upper-case range must be A-Z; the former A-z also spanned the ASCII
    # characters between 'Z' and 'a' (square brackets, backslash, caret,
    # underscore, backtick) and therefore let them through.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)
def remove_special_characters(text):
    """Remove special characters from ``text``, keeping letters and digits.

    Only ASCII letters, digits, whitespace and the punctuation marks
    . , ! ? / : ; plus double and single quotes are kept; every other
    character is deleted.
    """
    # Negated class listing the characters to keep. The upper-case range must
    # be A-Z; the former A-z also spanned the ASCII characters between 'Z' and
    # 'a' (square brackets, backslash, caret, underscore, backtick) and so
    # failed to remove them.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)
def remove_(tweet):
    """Return ``tweet`` with every underscore removed."""
    underscores = re.compile('([_]+)')
    return underscores.sub("", tweet)