-
Notifications
You must be signed in to change notification settings - Fork 1
/
file-validation.py
58 lines (51 loc) · 2.01 KB
/
file-validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import re
import time
import random
import requests
from bs4 import BeautifulSoup
# SOCKS5 proxy mapping in the shape accepted by the ``proxies`` argument of
# requests. Not currently passed to any request.
proxies = {'http': 'socks5://127.0.0.1:1080'}

# Site root; paper URLs are formed by appending 'paper/<name>' to it.
base_url = 'http://papers.nips.cc/'

# All input lives under ./data, resolved against the current working directory.
data_path = os.path.join(os.getcwd(), 'data')

# Names of the immediate sub-directories of data_path, in os.listdir order.
# Each name is later combined into 'papers-<name>.html', so the directories
# are presumably named after years -- confirm against the data layout.
year_dir = list(filter(
    lambda name: os.path.isdir(os.path.join(data_path, name)),
    os.listdir(data_path)))
# Matches the href of every paper link in a table-of-contents page.
_PAPER_HREF = re.compile("/paper.+")
# How many times an invalid intro page is re-downloaded before giving up.
_MAX_RETRIES = 3
# Seconds to wait on a single HTTP request before treating it as failed.
_REQUEST_TIMEOUT = 30


def _extract_title_authors(html):
    """Parse intro-page *html* and return ``(title, authors)``.

    ``title`` is the document's ``<title>`` tag, or ``None`` when the page has
    none. ``authors`` is a list with the ``.string`` of every
    ``<li class="author">`` element, empty when there are no such elements.
    """
    parsed = BeautifulSoup(html, 'lxml')
    authors = [item.string for item in parsed.find_all('li', 'author')]
    return parsed.title, authors


# For every year directory: read that year's table of contents, then check
# that each listed paper has a local intro page containing a title and at
# least one author. Pages that are missing or incomplete are downloaded again
# (up to _MAX_RETRIES attempts); papers that still fail have their URL printed.
for year in year_dir:
    parent = os.path.join(data_path, year)
    intro_dir = os.path.join(parent, 'intro')
    txt_dir = os.path.join(parent, 'txt')  # not used by the validation below

    # Open the TOC file and collect the list of introduction page links.
    toc_path = os.path.join(data_path, 'papers-' + year + '.html')
    with open(toc_path, 'r') as toc_file:
        toc_parsed = BeautifulSoup(toc_file.read(), 'lxml')
    # Drop the first 7 characters of each href (the length of '/paper/'),
    # leaving the name that is appended to base_url + 'paper/' below.
    paper_links = [a['href'][7:]
                   for a in toc_parsed.find_all(href=_PAPER_HREF)]

    # Validate the intro file of every paper.
    for paper in paper_links:
        intro_file = os.path.join(intro_dir, paper + '.html')
        url = base_url + 'paper/' + paper

        if os.path.isfile(intro_file):
            with open(intro_file, 'r') as ifile:
                title, authors = _extract_title_authors(ifile.read())
            if not title:
                print(intro_file + ' no title!')
        else:
            # A missing file is reported and then handled like an invalid
            # one, instead of crashing on open().
            print(intro_file + ' not exist!')
            title, authors = None, []

        fetched_html = None
        attempts_left = _MAX_RETRIES
        while (not authors or not title) and attempts_left:
            try:
                response = requests.get(url, timeout=_REQUEST_TIMEOUT)
            except requests.RequestException as err:
                # A network failure uses up one attempt rather than aborting
                # the whole validation run.
                print(url + ' request failed: ' + str(err))
            else:
                fetched_html = response.text
                title, authors = _extract_title_authors(fetched_html)
            # Wait 5-10 seconds after every request so that consecutive
            # downloads are spaced out.
            time.sleep(5 + random.randint(0, 5))
            attempts_left -= 1

        if authors and title:
            # Only rewrite the local copy when a download actually happened.
            if fetched_html is not None:
                if not os.path.isdir(intro_dir):
                    os.makedirs(intro_dir)
                with open(intro_file, 'w') as ifile:
                    ifile.write(fetched_html)
        else:
            # Still invalid after all attempts: report the URL for follow-up.
            print(url)