import datetime
import http.cookiejar
import json
import re
import uuid
import urllib.error
import urllib.parse
import urllib.request

from pyquery import PyQuery

from tweets import Tweet


def get_tweets(search_params, current_position):
    """
    Build the search query and fetch one page of tweets.
    :param search_params: SearchParams object
    :param current_position: min position to retrieve the tweets from
    :return: Twitter JSON data
    """
    base_url = "https://twitter.com/i/search/timeline?f=tweets&q={}&src=typd&{}max_position={}"
    # Assemble the query from whichever search filters are set.
    query = ''
    if search_params.search_query:
        query += ' ' + search_params.search_query
    if search_params.account_name:
        query += ' from:' + search_params.account_name
    if search_params.since_date:
        query += ' since:' + search_params.since_date
    if search_params.until_date:
        query += ' until:' + search_params.until_date
    lang = ('lang=' + search_params.language + '&') if search_params.language else ''
    query = urllib.parse.quote(query)
    base_url = base_url.format(query, lang, current_position)
    print(base_url)
    cookie_jar = http.cookiejar.CookieJar()
    headers = [
        ('Host', "twitter.com"),
        ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"),
        ('Accept', "application/json, text/javascript, */*; q=0.01"),
        ('Accept-Language', "en-US;q=0.7,en;q=0.3"),
        ('X-Requested-With', "XMLHttpRequest"),
        ('Referer', base_url),
        ('Connection', "keep-alive")
    ]
    attempts = 0
    response = None
    while attempts < 10:
        try:
            if search_params.proxy:
                print('Using IP {}'.format(search_params.proxy))
                proxy = urllib.request.ProxyHandler({'http': search_params.proxy, 'https': search_params.proxy})
                opener = urllib.request.build_opener(proxy, urllib.request.HTTPCookieProcessor(cookie_jar))
            else:
                opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
            opener.addheaders = headers
            response = opener.open(base_url)
            break
        except Exception:
            attempts += 1
            print('Request failed, retrying ({}/10)'.format(attempts))
    if response is None:
        raise ConnectionError('Could not fetch {} after 10 attempts'.format(base_url))
    json_res = response.read()
    json_data = json.loads(json_res.decode())
    return json_data
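

# Example (hypothetical SearchParams values; the class is defined elsewhere in
# this project): fetching the first page of a search would look like
#
#     params = SearchParams(search_query='#python', since_date='2018-01-01',
#                           until_date=None, account_name=None, language='en',
#                           proxy=None)
#     page = get_tweets(params, '')  # '' starts from the newest tweets
#
# 'page' is the raw JSON dict: 'items_html' holds the rendered tweets and
# 'min_position' the cursor for the next request.

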
def parse_json(search_params):
    """
    Page through the search results, parse each tweet and write it out.
    :param search_params: SearchParams object
    :return: void
    """
    min_position = get_last_search_position(search_params.log_file_name)
    count = 0
    while True:
        json_res = get_tweets(search_params, min_position)
        if len(json_res['items_html'].strip()) == 0:
            # No more rendered tweets: the search is exhausted.
            break
        min_position = json_res['min_position']
        search_params.logging.info('min_pos - {}'.format(min_position))
        item = json_res['items_html']
        scraped_tweets = PyQuery(item)
        # Withheld tweets carry no usable content, so drop them before parsing.
        scraped_tweets.remove('div.withheld-tweet')
        tweets = scraped_tweets('div.js-stream-tweet')
        for tweet_html in tweets:
            print(count)
            tweet_py_query = PyQuery(tweet_html)
            name = tweet_py_query.attr("data-name")
            screen_name = tweet_py_query.attr("data-screen-name")
            tweet_id = tweet_py_query.attr("data-tweet-id")
            # Collapse whitespace and re-attach the '#'/'@' that text()
            # separates from hashtags and mentions.
            tweet_text = re.sub(r"\s+", " ",
                                tweet_py_query("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
            tweet_date_time = int(tweet_py_query("small.time span.js-short-timestamp").attr("data-time"))
            tweet_date_time = datetime.datetime.fromtimestamp(tweet_date_time)
            retweet_count = int(tweet_py_query("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr(
                "data-tweet-stat-count").replace(",", ""))
            favorites_count = int(
                tweet_py_query("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr(
                    "data-tweet-stat-count").replace(",", ""))
            permalink = 'https://twitter.com' + tweet_py_query.attr("data-permalink-path")
            tweet = Tweet(str(uuid.uuid4()), name, screen_name, tweet_id, tweet_text, tweet_date_time, retweet_count,
                          favorites_count, permalink)
            # Now write to the output file or save to a DB
            write_op(search_params.op, tweet)
            count += 1
            # sleep(5)  # requires 'from time import sleep' if enabled
        if 0 < search_params.max_retrieval_count <= count:
            break
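

# Pagination contract: each response's 'min_position' cursor is passed back as
# 'max_position' on the next request, and parse_json() logs it as
# 'min_pos - <cursor>' so that get_last_search_position() can resume an
# interrupted run from the last line of the log file.

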
def write_op(op_file, tweet):
    """
    Append a tweet to the output file as one semicolon-separated line.
    :param op_file: output file name
    :param tweet: Tweet object
    :return: void
    """
    with open(op_file, 'a+', encoding='utf-8') as f:
        # UUID;tweet_id;user_name;screen_name;tweet;date_time;retweet_count;fav_count;link
        f.write('%s;%s;%s;%s;%s;%s;%d;%d;%s\n' % (tweet.uuid, tweet.tweet_id, tweet.name, tweet.screen_name,
                                                  tweet.tweet, tweet.date_time.strftime("%Y-%m-%d %H:%M"),
                                                  tweet.retweet_count, tweet.favourites_count, tweet.link))
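

# A written line looks like (values illustrative, not from a real run):
# 3f2b...;1093456789;Jane Doe;janedoe;Hello #python;2018-01-01 12:30;4;11;https://twitter.com/janedoe/status/1093456789

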
def get_last_search_position(logger_file):
    """
    Read the last logged position so a previous search can be resumed.
    :param logger_file: logger file name
    :return: last position id, or '' when there is nothing to resume
    """
    try:
        with open(logger_file, 'r') as f:
            lines = f.read().splitlines()
        last_pos = lines[-1].split(' - ')[1]
    except (FileNotFoundError, IndexError):
        # No log file yet, or nothing logged: start a fresh search.
        last_pos = ''
    return last_pos
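

# Usage sketch: a SearchParams object is assumed to be constructed elsewhere
# in this project (its constructor is not shown in this file).
#
#     params = SearchParams(...)  # query, dates, language, proxy, op file
#     parse_json(params)
#
# parse_json() resumes from the last cursor logged in params.log_file_name,
# pages through the timeline, and appends each tweet to params.op.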