-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
49 lines (41 loc) · 1.26 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
'''
Wikipedia anchor-link scraper.

Prompts for a Wikipedia article URL, downloads the page, extracts every
anchor tag's visible text (or its title attribute when the text is empty)
together with its href, resolves site-relative / protocol-relative /
in-page-fragment hrefs to absolute URLs, and writes the collected rows
to a CSV file using pandas.

Requires the third-party libraries: requests, beautifulsoup4 (bs4), pandas.
'''
# imports and variables
import requests
import pandas as pd
from bs4 import BeautifulSoup

csvData = {}    # serial number -> [link text, absolute URL]
serial_no = 0

# url input for scraping
url = input('Please enter wikipedia URL: ')
print('requesting to website data....')
# url = 'https://en.wikipedia.org/wiki/Main_Page'
r = requests.get(url)
htmlContent = r.content
soup = BeautifulSoup(htmlContent, 'html.parser')
anchor = soup.find_all('a')
print('looking for href data....')

# loop for getting all of the href links
for link in anchor:
    href = link.get('href')
    if href is None:
        # BUG FIX: anchors without an href used to be recorded with the
        # literal string 'None' as their link; skip them instead.
        continue
    data = str(href)
    serial_no += 1
    # prefer the visible anchor text; fall back to the title attribute
    text = link.get('title') if str(link.text) == '' else link.text
    # BUG FIX: the '//' test must come BEFORE the '/' test — previously the
    # '/' branch shadowed it, so protocol-relative links (e.g.
    # '//upload.wikimedia.org/...') were wrongly prefixed with
    # 'https://en.wikipedia.org'.
    if data.startswith('//'):
        hrefLink = 'https:' + data                     # protocol-relative URL
    elif data.startswith('/'):
        hrefLink = 'https://en.wikipedia.org' + data   # site-relative path
    elif data.startswith('#'):
        hrefLink = url + data                          # in-page fragment
    else:
        hrefLink = data                                # already absolute / other scheme
    # storing data into csvData dictionary
    csvData[serial_no] = [text, hrefLink]

# writing data to csv file using pandas library
data_df = pd.DataFrame.from_dict(csvData, orient='index', columns=['textContent', 'link'])
name_of_files = input("enter then file name: ")
data_df.to_csv(name_of_files + '.csv')
print('scrapping done')