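"""OutboundLinks.py

Scrapes the outbound links of every URL listed in an input CSV and
aggregates them per related term. A sketch of the intended flow, inferred
from the code below:

  1. scrapeURLS("output.csv") reads a CSV with 'Related Terms' and 'URL'
     columns, collects each URL's outbound links, and writes outbound.csv.
  2. calculateOutboundLinks() reads outbound.csv, keeps reserved (gov/wiki)
     links plus links shared by at least two of a term's URLs, cleans the
     result, and writes Result.csv.
"""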
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
from collections import Counter

pd.options.mode.chained_assignment = None  # Silences the copied-dataframe warning

# Reserved keywords: URLs containing these are trusted and kept as-is
RESERVED = ["gov", "wiki"]
# Scrapes the outbound links for a single URL
def getOutboundLink(url):
    headers = {"user-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0"}
    # IMPORTANT: verify=False skips SSL certificate errors on misconfigured sites
    resp = requests.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(resp.text, 'lxml')
    # A set keeps only unique links
    OutboundLinks = set()
    for link in soup.find_all('a', href=re.compile("http")):
        OutboundLinks.add(link['href'])
    # Returns the list of all outbound links for the URL
    return list(OutboundLinks)
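# A quick sanity check, assuming network access (the URL and the exact
# output here are only illustrative):
#
#   >>> getOutboundLink("https://example.com")
#   ['https://www.iana.org/domains/example']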
# Reads the input file (output.csv) and scrapes outbound links for every URL
def scrapeURLS(filename):
    df = pd.read_csv(filename)  # read_csv already returns a DataFrame
    allPhrases = list(set(df['Related Terms']))
    # Accumulate rows in a list and build the DataFrame once at the end,
    # which avoids the deprecated DataFrame.append
    rows = []
    for phrase in allPhrases:
        URLsForPhrase = df.loc[df['Related Terms'] == phrase, 'URL']
        for url in URLsForPhrase:  # iterate over every URL, including the last
            # print("Processing for : " + phrase + url)
            if any(x in url for x in RESERVED):
                # Reserved (gov/wiki) URLs are trusted: record the URL itself
                rows.append({'Related Terms': phrase, 'URL': url,
                             'Outbound Links': url})
            else:
                rows.append({'Related Terms': phrase, 'URL': url,
                             'Outbound Links': ' , '.join(getOutboundLink(url))})
    LinkPerURL = pd.DataFrame(rows, columns=['Related Terms', 'URL', 'Outbound Links'])
    LinkPerURL.to_csv("outbound.csv", encoding='utf-8', index=False)
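# At this point outbound.csv holds one row per (term, URL) pair, with that
# URL's outbound links joined by ' , ' in the 'Outbound Links' column.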
# Aggregates outbound.csv: keeps reserved links and links shared across URLs
def calculateOutboundLinks():
    df = pd.read_csv("outbound.csv")
    # Split off the rows whose URL contains a reserved keyword
    reserveWordString = '|'.join(RESERVED)
    reservedRecords = df[df['URL'].str.contains(reserveWordString, na=False)]
    diff = df[~df.apply(tuple, axis=1).isin(reservedRecords.apply(tuple, axis=1))]
    allPhrases = list(set(diff['Related Terms']))
    rows = []
    for phrase in allPhrases:
        LinksPerTerm = diff.loc[diff['Related Terms'] == phrase, 'Outbound Links'].tolist()
        # Flatten the ' , '-joined strings back into one list of links,
        # stripping the padding spaces so identical links compare equal
        mylist = [s.strip() for s in ','.join(map(str, LinksPerTerm)).split(",")]
        dictPerPhrase = Counter(mylist)  # Counts the occurrences of each link
        for link, freq in dictPerPhrase.items():
            # Keep reserved (wiki/gov) links, plus links that appear under
            # at least two of the term's URLs
            if any(x in link for x in RESERVED) or freq >= 2:
                rows.append({'Related Terms': phrase, 'Outbound Links': link})
    dfObj = pd.DataFrame(rows, columns=['Related Terms', 'Outbound Links'])
    # Optional de-duplication (keeping the first occurrence per term):
    # dfObj.drop_duplicates(subset="Related Terms", keep='first', inplace=True)
    reservedRecords = reservedRecords.drop(columns=['URL'])
    result = pd.concat([dfObj, reservedRecords], ignore_index=True, sort=False)
    # DATA CLEANING
    result = result[result['Outbound Links'].str.contains('http', na=False)]
    result = result.loc[:, ~result.columns.str.contains('^Unnamed')]
    # Drop terms that accumulated more than n links (high-frequency noise)
    col = 'Related Terms'
    n = 20
    result = result[result.groupby(col)[col].transform('count').le(n)]
    result.to_csv("Result.csv", encoding='utf-8-sig', index=False, header=True)
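# A minimal illustration of the group-count filter above, on a toy frame
# (names and values are made up for the example):
#
#   >>> toy = pd.DataFrame({'Related Terms': ['a', 'a', 'b'],
#   ...                     'Outbound Links': ['u1', 'u2', 'u3']})
#   >>> toy[toy.groupby('Related Terms')['Related Terms'].transform('count').le(1)]
#     Related Terms Outbound Links
#   2             b             u3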
if __name__ == '__main__':
    # Run scrapeURLS first to (re)build outbound.csv, then aggregate:
    # scrapeURLS("output.csv")
    calculateOutboundLinks()