-
Notifications
You must be signed in to change notification settings - Fork 0
/
webScraper.py
64 lines (56 loc) · 1.97 KB
/
webScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import urllib
import requests
from bs4 import BeautifulSoup
import html
from fake_useragent import UserAgent
from google import google
class StackoverflowWebScraper:
    """Locate a Stack Overflow question via Google and fetch its top answer.

    ``get_top_question`` runs a Google search and returns the first result
    that points at a Stack Overflow question page. ``get_top_answer`` then
    asks the Stack Exchange API for that question's answers, sorted by votes.
    Both methods return ``None`` when nothing usable is found.
    """

    # Every Stack Overflow question URL contains this prefix; the numeric
    # question id is the path segment that immediately follows it.
    _QUESTION_URL_PREFIX = "https://stackoverflow.com/questions/"

    # Seconds to wait on the Stack Exchange API. Without an explicit timeout
    # ``requests`` may block indefinitely on a stalled connection.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set the API base URL and create the fake user-agent generator."""
        self.stackoverflow_base_api = "https://api.stackexchange.com/2.2"
        # Not read anywhere in this class; presumably a pause (in seconds)
        # applied by callers between requests -- TODO confirm.
        self.time_delay = 5
        self.ua = UserAgent()

    @classmethod
    def _extract_question_id(cls, link):
        """Return the numeric question id embedded in *link*, or ``None``.

        ``None`` is returned when *link* is empty/missing, does not contain
        the Stack Overflow question prefix, or when the segment after the
        prefix is not a number (e.g. ``/questions/tagged/python``).
        """
        if not link:
            return None
        # partition() keeps only the text after the first occurrence of the
        # prefix, so a link that merely embeds the question URL (such as a
        # redirect wrapper) still yields the right segment.
        _, prefix_found, tail = link.partition(cls._QUESTION_URL_PREFIX)
        if not prefix_found:
            return None
        # Cut at the next path separator, then drop any query string or
        # fragment glued directly onto the id.
        candidate = tail.split("/", 1)[0].split("?", 1)[0].split("#", 1)[0]
        return candidate if candidate.isdigit() else None

    # Uses Top Stackoverflow search results on Google Search.
    def get_top_question(self, question):
        """Return the first Google result that is a Stack Overflow question.

        Args:
            question: Free-text query handed to ``google.search``.

        Returns:
            A dict with keys ``"title"``, ``"link"`` and ``"question_id"``
            (the id as a string of digits), or ``None`` when no result links
            to a Stack Overflow question.
        """
        # The second argument is passed through unchanged from the original
        # call; presumably the number of result pages -- TODO confirm.
        search_results = google.search(question, 1)
        for result in search_results:
            # The first hit is not always a question page, so keep scanning
            # until one yields a valid id or the results run out.
            question_id = self._extract_question_id(result.link)
            if question_id is None:
                continue
            return {
                "title": result.name,
                "link": result.link,
                "question_id": question_id,
            }
        # Found nothing
        return None

    def get_top_answer(self, question_id):
        """Fetch the highest-voted answer for *question_id*.

        Args:
            question_id: Stack Overflow question id (string or int); it is
                interpolated into the request URL.

        Returns:
            The first entry of the API response's ``"items"`` list (answers
            are requested in descending vote order), or ``None`` when the
            response carries no items -- for instance an unanswered question
            or an error payload.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the request exceeds the timeout.
        """
        answer_request = requests.get(
            url="{0}/questions/{1}/answers".format(
                self.stackoverflow_base_api, question_id
            ),
            params={
                "order": "desc",
                "sort": "votes",
                "site": "stackoverflow",
                "filter": "withbody",
            },
            headers={
                "user-agent": self.ua.random,  # Rotates fake user agents
            },
            timeout=self._REQUEST_TIMEOUT,
        )
        answer_response = answer_request.json()
        # A missing or empty "items" list means there is nothing to return;
        # report that as None, matching get_top_question's "found nothing".
        answers = answer_response.get("items") or []
        return answers[0] if answers else None
# Alternative get_top_question that queries the Stack Exchange search API directly. Its results were highly inaccurate, so it is kept commented out for now.
# def get_top_question(self, question):
# question_request = requests.get(
# url="{0}/search/advanced".format(self.stackoverflow_base_api),
# params={
# "order": "desc",
# "sort": "votes",
# "q": question,
# "site": "stackoverflow"
# }
# )
# question_response = question_request.json()
# top_question = question_response["items"][0]
# return top_question