-
Notifications
You must be signed in to change notification settings - Fork 0
/
case_folding.py
72 lines (35 loc) · 1.06 KB
/
case_folding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import pandas as pd
import string
import re
# In[ ]:
def load_data():
    """Read ``DatasetbaruID(1).csv`` from the working directory.

    The first line of the file is skipped and the three columns are named
    ``TweetID``, ``Tweet_Author`` and ``Tweet_Reply``.

    Returns:
        pandas.DataFrame: the parsed rows.
    """
    # Plain 'r' replaces the legacy 'rU' mode, which Python 3.11+ rejects;
    # text mode already translates newlines. The ``with`` block makes sure
    # the handle is closed once pandas has finished parsing.
    with open('DatasetbaruID(1).csv', 'r') as source:
        return pd.read_csv(
            source,
            skiprows=1,
            names=['TweetID', 'Tweet_Author', 'Tweet_Reply'],
            engine='python',
        )
# Load the raw tweets. The bare ``head(3)`` expression only renders a preview
# when this runs as a notebook cell; in a plain script it has no effect.
tweet_df = load_data()
tweet_df.head(3)
# In[ ]:
df = pd.DataFrame(tweet_df[['TweetID', 'Tweet_Author', 'Tweet_Reply']])
# In[ ]:
# Convert every cell to its ``str()`` form, one column at a time.
# ``Series.map`` is used instead of ``DataFrame.applymap``, which pandas
# deprecated in 2.1.
df = df.apply(lambda column: column.map(str))
# In[ ]:
# Let displayed frames show up to 100 characters per column.
pd.set_option('display.max_colwidth', 100)
# In[ ]:
def normalize(tweet):
    """Return a lower-cased, ASCII-only, single-spaced copy of *tweet*.

    Steps, in order:

    1. Remove links (``http`` followed by any run of non-space characters).
    2. Turn hyphens and underscores into spaces.
    3. Remove ASCII digits.
    4. Replace each run of non-word characters with one space.
    5. Lower-case the text.
    6. Drop every character outside the printable ASCII range 32..126.
    7. Collapse whitespace runs to one space and trim both ends.

    Args:
        tweet: The text to clean; must be a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    tweet = re.sub(r"http\S+", "", tweet)
    # Underscores count as word characters, so ``\W+`` below never touches
    # them. They are converted here, before the whitespace collapse, so they
    # cannot leave doubled, leading or trailing spaces in the result.
    tweet = re.sub(r"[-_]", " ", tweet)
    tweet = re.sub(r"[0-9]+", "", tweet)
    tweet = re.sub(r"\W+", " ", tweet)
    tweet = tweet.lower()
    # Non-ASCII letters survive ``\W+`` (it is Unicode-aware), so they are
    # filtered out here, after lower-casing.
    tweet = "".join(ch for ch in tweet if 32 <= ord(ch) <= 126)
    return re.sub(r"\s+", " ", tweet).strip()
# Clean the author and reply text cell by cell. Only these two columns are
# kept from here on, so TweetID is not part of the exported file.
# ``Series.map`` is applied per column instead of ``DataFrame.applymap``,
# which pandas deprecated in 2.1; ``normalize`` is passed directly rather
# than through a lambda wrapper.
df = df[['Tweet_Author', 'Tweet_Reply']].apply(lambda column: column.map(normalize))
df
# In[ ]:
# Write the cleaned columns with a header row and without the index column.
df.to_csv('TweetBaru_clean_Final2.csv', index=False, header=True)
# In[ ]: