-
Notifications
You must be signed in to change notification settings - Fork 4
/
scrape_img_pg_links.py
83 lines (66 loc) · 2.61 KB
/
scrape_img_pg_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
'''
Usage: python scrape_imagelinks.py --url <url of the race album you want to scrape>

Scrolls an infinite-scroll photo-album page to the bottom, collects the
links of the per-image pages (anchors whose class contains
"sm-tile-content"), and writes them to img_pg_links/<album_name>.csv.
'''
# import the necessary packages
from selenium import webdriver
import datetime
import time
import argparse
import os

# Define the argument parser to read in the URL
parser = argparse.ArgumentParser()
parser.add_argument('-url', '--url', help='URL to the online repository of images')
args = vars(parser.parse_args())
url = args['url']
#url = "https://www.myracephotos.in/Event-Photos/Kaveri-Trail-Marathon-2018/Kaveri-Trail-Marathon-2018/"

# Extract the album name (last path segment). rstrip('/') makes this work
# whether or not the URL has a trailing slash; the original split('/')[-2]
# picked the wrong segment for URLs without one.
album_name = url.rstrip('/').split('/')[-1]

# Define Chrome options to open the window in maximized mode
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

# Initialize the Chrome webdriver and open the URL
# NOTE(review): chrome_options= and find_elements_by_tag_name below are the
# Selenium 3 API this script was written against; Selenium 4 renamed them.
driver = webdriver.Chrome(chrome_options=options)
try:
    driver.get(url)
    # Define a pause time in between scrolls so lazily-loaded tiles can appear
    pause_time = 2
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    # Record the starting time
    start = datetime.datetime.now()
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # wait to load page
        time.sleep(pause_time)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # height stopped growing -> end of page
            break
        # update the last height
        last_height = new_height
    # Record the end time, then calculate and print the total time
    delta = datetime.datetime.now() - start
    print("[INFO] Total time taken to scroll till the end {}".format(delta))

    # Extract the urls of only the image-page tiles from all anchor tags
    hrefs = []
    for tag in driver.find_elements_by_tag_name('a'):
        # get_attribute('class') is None for anchors without a class
        # attribute; 'or ""' avoids the resulting TypeError on 'in'.
        css_class = tag.get_attribute('class') or ''
        if "sm-tile-content" in css_class:
            hrefs.append(tag.get_attribute('href'))
finally:
    # Always release the browser, even if scraping fails midway
    driver.quit()

# Create the directory after checking if it already exists or not
dir_name = 'img_pg_links'
if not os.path.exists(dir_name):
    try:
        os.mkdir(dir_name)
    except OSError:
        print("[INFO] Creation of the directory {} failed".format(os.path.abspath(dir_name)))
    else:
        print("[INFO] Successfully created the directory {} ".format(os.path.abspath(dir_name)))

# Write the links to the image pages to a file; 'with' guarantees the handle
# is flushed and closed (the original leaked the open file object).
with open("{}/{}.csv".format(dir_name, album_name), 'w') as f:
    f.write(",\n".join(hrefs))
print("[INFO] Successfully created the file {}.csv with {} links".format(album_name, len(hrefs)))