-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_xkcd.py
29 lines (26 loc) · 1.12 KB
/
scrape_xkcd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import requests
from bs4 import BeautifulSoup
import os
import urllib
import time
# Scrape xkcd from the newest comic backwards. For every comic page that
# carries an image, create a directory named after the comic number in the
# current working directory and store two files in it:
#   <number>/<number>.txt  - the image's title (hover) text, ASCII only
#   <number>/<number>.png  - the image bytes
initial_url = 'https://xkcd.com/'
request_timeout = 30          # seconds; requests.get() has no timeout by default
delay_between_requests = 0.8  # seconds to wait between page fetches
max_consecutive_missing = 3   # give up after this many 404 pages in a row

# A protocol-relative image src ("//host/path") inherits the scheme of the
# page it was found on, so reuse the scheme of initial_url ("https:").
url_scheme = initial_url.split('//', 1)[0]

comic_number = None      # None = front page; the real number is read from it
consecutive_missing = 0
next_url = ''

while comic_number is None or comic_number >= 1:
    if comic_number is None:
        next_url = initial_url
    else:
        next_url = initial_url + str(comic_number)
    page_request = requests.get(next_url, timeout=request_timeout)

    if page_request.status_code == 404:
        # Check the response that was just fetched (the original tested a
        # response obtained once before the loop, so this never triggered).
        # NOTE(review): xkcd appears to answer comic number 404 with a real
        # HTTP 404, so one missing page is skipped rather than treated as the
        # end of the archive - confirm this is the desired behaviour.
        consecutive_missing += 1
        if comic_number is None or consecutive_missing >= max_consecutive_missing:
            break
        comic_number -= 1
        time.sleep(delay_between_requests)
        continue
    page_request.raise_for_status()  # fail loudly on any other HTTP error
    consecutive_missing = 0

    soup_obj = BeautifulSoup(page_request.text, 'lxml')

    if comic_number is None:
        # The front page does not say which comic it is in its URL, so derive
        # the number from the second navigation link (the "previous" link),
        # exactly as the original did: previous number + 1.
        nav = soup_obj.find('ul', {'class': 'comicNav'})
        nav_links = nav.find_all('a') if nav is not None else []
        if len(nav_links) < 2:
            raise RuntimeError('could not find the comic navigation on ' + next_url)
        next_comic_number = nav_links[1].get('href', '').replace('/', '')
        # A non-numeric href (presumably '#' on the very first comic - TODO
        # confirm) means there is no previous comic, i.e. this is comic 1.
        if next_comic_number.isdigit():
            comic_number = int(next_comic_number) + 1
        else:
            comic_number = 1

    comic_div = soup_obj.find('div', {'id': 'comic'})
    comic_image = comic_div.img if comic_div is not None else None

    # Pages without an <img> inside div#comic are skipped instead of crashing.
    if comic_image is not None and comic_image.get('src'):
        # Non-ASCII characters are deliberately dropped, as in the original.
        alt_text = comic_image.get('title', '').encode('ascii', 'ignore').decode()

        image_src = comic_image['src']
        if image_src.startswith('//'):
            comic_image_url = url_scheme + image_src
        elif image_src.startswith('/'):
            comic_image_url = initial_url.rstrip('/') + image_src
        else:
            comic_image_url = image_src

        # Download through requests: urllib.urlretrieve only exists on
        # Python 2 (Python 3 moved it to urllib.request). Fetch before
        # touching the disk so a failed download leaves no empty directory.
        image_response = requests.get(comic_image_url, timeout=request_timeout)
        image_response.raise_for_status()

        comic_path_on_disk = str(comic_number)
        comic_dir = os.path.join(os.getcwd(), comic_path_on_disk)
        if not os.path.isdir(comic_dir):  # tolerate re-runs
            os.makedirs(comic_dir)

        alt_text_path = os.path.join(comic_dir, comic_path_on_disk + '.txt')
        with open(alt_text_path, 'w') as alt_text_file:
            alt_text_file.write(alt_text)

        # NOTE: the file is always named .png, whatever the real image format
        # is; kept so existing output layouts do not change.
        image_path = os.path.join(comic_dir, comic_path_on_disk + '.png')
        with open(image_path, 'wb') as image_file:
            image_file.write(image_response.content)

    comic_number -= 1  # walk backwards; the loop ends after comic 1
    time.sleep(delay_between_requests)