scopus_scrape.py
import os

from bs4 import BeautifulSoup
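
# Assumed on-disk layout, inferred from the paths used below:
#   ./Pages/<year>/<n>.html  (one saved Scopus results page per file,
#   numbered from 1 within each year, for years 1991-2023)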
# Parse the saved Scopus result pages and collect (title, abstract) pairs,
# keyed by a (year, page) tuple.
page_title_abstract = {}
year_start = 1991
year_end = 2023
for year in range(year_start, year_end + 1):
    # Count the saved result pages for this year (one numbered .html file per page).
    files = os.listdir("./Pages/" + str(year))
    pages = sum(1 for file in files if file.endswith(".html"))
    for page in range(1, pages + 1):
        print(year, page)
        filename = './Pages/' + str(year) + "/" + str(page) + '.html'
        with open(filename, 'r', encoding='utf8') as f:
            contents = f.read()
        soup = BeautifulSoup(contents, 'html.parser')
        titles = []
        abstracts = []
        # Each result title on a saved Scopus page carries the 'ddmDocTitle' class.
        for title in soup.find_all(class_='ddmDocTitle'):
            title_text = title.text.replace('\n', ' ').replace('\t', ' ')
            titles.append(title_text)
        # Abstract containers are id'd previewAbstract1 .. previewAbstractN, one per title.
        for j in range(1, len(titles) + 1):
            abstract_div = soup.find(id="previewAbstract" + str(j))
            if abstract_div is None:
                # Keep titles and abstracts aligned when an abstract is missing.
                abstracts.append("")
                continue
            abstract = abstract_div.find(class_="txt")
            abstract = abstract.text if abstract is not None else abstract_div.text
            abstract = abstract.replace('\n', ' ').replace('\t', ' ')
            abstracts.append(abstract)
        page_title_abstract[(year, page)] = list(zip(titles, abstracts))
with open("output.csv", "w", encoding="utf8") as f:
f.write("Year\tPage\tTitle\tAbstract\n")
keys = list(page_title_abstract.keys())
keys.sort()
for k in keys:
for paper in page_title_abstract[k]:
f.write(str(k[0]) + "\t" + str(k[1]) + "\t" + paper[0] + "\t" + paper[1] + "\n")
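
# --- Usage sketch (not part of the original script) ---
# A minimal example of loading the tab-separated output back in for a quick
# sanity check; it assumes output.csv was just written to the working directory.
import csv

with open("output.csv", "r", encoding="utf8", newline="") as f:
    reader = csv.DictReader(f, delimiter="\t")
    rows = list(reader)
print(len(rows), "records loaded")
if rows:
    print(rows[0]["Year"], rows[0]["Title"][:80])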