-
Notifications
You must be signed in to change notification settings - Fork 0
/
test2.py
96 lines (85 loc) · 3.58 KB
/
test2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from general_functions import *
import time
import mysql.connector
from mysql.connector import errorcode
import json
# Wall-clock start of the whole script; reported at the very end.
global_time = time.time()

# Load DB credentials from config.json. `with` closes the file handle
# (the original left it open for the lifetime of the process).
with open("config.json") as c:
    config = json.load(c)

# Media-set identifier whose captions we want to analyse.
MUID = 'fr8heaven_1_hashtagRecent_6_3708ca94'
try:
    # Connect with the credentials loaded from config.json above.
    cnx = mysql.connector.connect(user=config["SQL"]["username"],
                                  password=config["SQL"]["password"],
                                  host=config["SQL"]["hostname"],
                                  database=config["SQL"]["database"],
                                  )
except mysql.connector.Error as err:
    # Report the two most common connection failures in plain words;
    # print anything else verbatim.
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Something is wrong with your user name or password")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database does not exist")
    else:
        print(err)
else:
    print("Looking for caption in MUID:", MUID)
    cursor = cnx.cursor()
    # Parameterized query: the original interpolated MUID into the SQL
    # string with %, which is injection-prone and mis-quotes values.
    cursor.execute("SELECT * FROM data_media WHERE MUID = %s", (MUID,))
    posts = cursor.fetchall()
    print("MUID found :", len(posts))

    # Column 11 holds the caption text; concatenate all captions into one
    # corpus. str.join replaces the original quadratic += loop.
    post_list_str = "".join(post[11] for post in posts)

    start_time = time.time()

    # Insert a space before every '#' so glued hashtags split into
    # separate whitespace tokens. (The original also applied this to a
    # dead throwaway variable `s`; that dead code is removed.)
    post_list_str = re.sub(r'#', r' #', post_list_str)

    # Keep only the first occurrence of each token, preserving order.
    out = []
    seen = set()
    for word in post_list_str.split(" "):
        if word not in seen:
            print(word)
            out.append(word)
            seen.add(word)

    # Rebuild the de-duplicated corpus (trailing-space form, matching the
    # original += construction), echoing each kept token as before.
    unique_post_list_str = "".join(word + " " for word in out)
    for word in out:
        print(word)
    print("unique tokens:", len(out))
    print("whole corpus:", len(post_list_str.split(" ")))

    # Run the NLP pipeline (nlp is provided by general_functions) over the
    # de-duplicated corpus and classify each token via the custom `token._.*`
    # extension attributes registered elsewhere in the project.
    doc = nlp(unique_post_list_str)
    for token in doc:
        if not token.is_space:
            print(token.text, token.lemma_, token.pos_)
        if token._.is_hashtag:
            print(token.text, " - hashtag")
            token_hashtag = re.sub(r'#', r'', token.text)
        if token._.is_city:
            print("City Hashtag -", token._.geo_hashtag, "countrycode -", token._.geo_countrycode)
        elif token._.is_graffiti_lingo:
            print("Graffiti Hashtag -", token._.graffiti_hashtag)
        elif token._.is_railroad_lingo:
            print("Railroad Hashtag -", token._.railroad_hashtag)
        elif token._.is_mention:
            token_mention = re.sub(r'@', r'', token.text)
            print(token.text, " - arroba ")

    # Release DB resources (the original never closed either).
    cursor.close()
    cnx.close()

    print("post time --- %s seconds ---" % (time.time() - start_time))
    print("total time --- %s seconds ---" % (time.time() - global_time))