-
Notifications
You must be signed in to change notification settings - Fork 2
/
fetch_related.py
133 lines (115 loc) · 4.21 KB
/
fetch_related.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from bs4 import BeautifulSoup
import urllib2
import time
import numpy as np
import re
import csv
import pandas as pd
import numpy as np
import sys
# Python 2-only workaround: reload(sys) is called so that
# sys.setdefaultencoding() can be used to make UTF-8 the implicit str/unicode
# conversion codec (fetch() calls str() on scraped text).
# NOTE(review): neither reload() as a builtin nor sys.setdefaultencoding()
# exists on Python 3 -- confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding("utf8")
import xlsxwriter
# Module-level flag initialised to False; no code visible in this file reads
# or changes its value.
f = False
def _extract_answer_text(page_source):
    """Return the answer text found in one answer page, or None.

    Looks for ``div.ui_qtext_expanded`` elements first and, when there are
    none, for divs whose class attribute is ``ExpandedAnswer ExpandedQText``.
    Each matching div's first ``span.ui_qtext_rendered_qtext`` is read in
    turn, so the text of the last div read successfully is what is returned.

    Args:
        page_source: HTML of the answer page.

    Returns:
        The extracted text, or None when nothing could be extracted.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser happens to be installed on the machine.
    soup = BeautifulSoup(page_source, "html.parser")
    answer_divs = soup.find_all("div", class_="ui_qtext_expanded")
    if not answer_divs:
        answer_divs = soup.find_all("div", class_="ExpandedAnswer ExpandedQText")

    answer_text = None
    try:
        for div in answer_divs:
            answer_text = str(div.find_all("span", class_="ui_qtext_rendered_qtext")[0].text)
            print(answer_text)
    except Exception as e:
        # Deliberately best effort: a div without the expected span (or text
        # that cannot be converted) stops the scan, and whatever was
        # extracted before the failure is kept.
        print(e)
    return answer_text


def fetch(driver,dictionary,count,file_name):
    """Download every answer linked from one question and save them to a workbook.

    The workbook is written to
    ``./<file_name>/<file_name>till_<count>_answers.xlsx`` (the directory is
    not created here) and holds a single row: the question text in column 0
    followed by one answer per column, in the reverse of the order in which
    the answers were fetched.  Column 1 holds ``"no answer"`` when the
    question has no answer links, and ``"no_answer"`` when links exist but no
    text could be extracted from any of them.

    Args:
        driver: Selenium WebDriver used to load each answer page.
        dictionary: Mapping with the keys ``"ques"`` (question text) and
            ``"answer_link_list"`` (URL paths, each appended to the
            module-level ``base_url``).
        count: Question number; used in the progress message and file name.
        file_name: Topic name; used as output directory and file-name prefix.

    Returns:
        None.  The result is the ``.xlsx`` file on disk.
    """
    answer_link_list = dictionary["answer_link_list"]
    print("number of answers for question number "+str(count)+":"+str(len(answer_link_list)))

    # Scrape first, write afterwards: the workbook is then only open for the
    # short writing phase instead of across every page load.
    answers = []
    for count_a, answer in enumerate(answer_link_list, 1):
        # Random sub-second pause between page loads.
        time.sleep(np.random.uniform(0.00, 1.00, size=None))
        driver.get(base_url + answer)
        answer_text = _extract_answer_text(driver.page_source)
        # None (not a magic string) marks "nothing extracted", so an answer
        # whose text really is "no answer" is no longer discarded.
        if answer_text is not None:
            answers.append(answer_text)
        print("answer number "+str(count_a))

    workbook = xlsxwriter.Workbook("./"+str(file_name)+"/"+str(file_name)+"till_"+str(count)+"_answers.xlsx")
    try:
        worksheet = workbook.add_worksheet()
        row = 0
        worksheet.write(row, 0, dictionary["ques"])
        if not answer_link_list:
            worksheet.write(row, 1, "no answer")
        else:
            # No upvote data is collected, so there is nothing to sort by;
            # the answers are simply emitted in reverse fetch order.
            cells = list(reversed(answers)) or ["no_answer"]
            for col, text in enumerate(cells, 1):
                worksheet.write(row, col, text)
    finally:
        # Always close the workbook, even if a write fails.
        workbook.close()
# "./cardiology/Cardiology",
# list_ = ["./cardiology/Cardiologists","./cardiology/Cardiovascular-Fitness","./cardiology/Cardiovascular-Diseases"]
# Topics to process; each one needs a "<topic>_links.xlsx" workbook (sheet
# "Sheet1", question text in column 0 and question link in column 1) in the
# working directory.  Output goes to the "mf/" directory, which must exist.
list_ = ["Mutual-Funds"]
# Row of the links sheet at which processing starts.
start_row = 10
for file_name in list_:
    df = pd.read_excel(str(file_name)+"_links.xlsx",sheet_name="Sheet1")
    # Slice from start_row onwards.  Indexing a single element here would
    # yield one string per column, and zip() below would then pair up
    # individual characters instead of (question, link) rows.
    questions = df.loc[:,0].values.tolist()[start_row:]
    links = df.loc[:,1].values.tolist()[start_row:]
    question_link ={}
    driver = webdriver.Firefox()
    base_url = "https://www.quora.com"
    # Not used by any code visible in this file; kept so the module-level
    # names stay available.
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    question_with_anwer_links_and_views = []
    try:
        driver.get(base_url)
        # time.sleep() requires a duration; calling it without one raises
        # TypeError.  NOTE(review): 5 seconds is a guess at the intended
        # pause after loading the home page -- adjust as needed.
        time.sleep(5)
        # `count` is the sheet row of the current question.  Deriving it from
        # enumerate() keeps it correct even when a row is skipped below.
        for count, (key, link) in enumerate(zip(questions, links), start_row):
            print("\n\n\n\n\n\n\n\n\n\n\n")
            print(":::::::::::::::::::::::::::::::::::::::::::::::::::"+str(count))
            if "unanswered" in link:
                continue
            driver.get(base_url+"/"+link)
            # Pass the page text straight to the parser and name the parser
            # explicitly, so no encoding guess or parser auto-selection is
            # involved.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            related_questions_tag = soup.find("div",class_="question_related list side_bar")
            if related_questions_tag is None:
                # Page has no related-questions sidebar; skip it instead of
                # crashing on the attribute access below.
                print("no related questions found for: "+str(link))
                continue
            related_questions_links = [
                anchor["href"]
                for anchor in related_questions_tag.find_all("a",class_='question_link')
            ]
            related_questions = [
                span.text
                for span in related_questions_tag.find_all("span",class_='ui_qtext_rendered_qtext')
            ]
            # One row per related question: text in column 0, link in column 1.
            related_df = pd.DataFrame([related_questions,related_questions_links]).transpose()
            # "/" would be read as a path separator in the output file name.
            related_df.to_excel("mf/"+str(key.replace("/"," "))+".xlsx")
    finally:
        # Close the browser even when scraping fails part-way through.
        driver.quit()