imgur_download.py
import sys
import re
import urllib.request
import urllib.parse
import urllib.error
import os
import string
import random
import time
import argparse
parser = argparse.ArgumentParser(description='Download images from reddit')
parser.add_argument("-l", help="Log number of downloaded files", action="store_true")
args = parser.parse_args()
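# State lives under ~/.config/imgur_down/: downloaded.txt is the dedup list of
# already-fetched urls and subreddits.txt lists the subreddits to check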
home = os.getenv("HOME")
downloaded = '{0}/.config/imgur_down/downloaded.txt'.format(home)
SUBREDDITS = '{0}/.config/imgur_down/subreddits.txt'.format(home)
logfile = '{0}/.config/imgur_down/log.txt'.format(home)
download_dir = '{0}/Pictures/Imguralbums'.format(home)
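# Per-category counters for the summary printed (and optionally logged) at the end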
imgur_albums_count = 0
picture_links_count = 0
gif_links_count = 0
# The pid for the lock file
pid = str(os.getpid())
pidfile = "/tmp/rid.pid"
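# A simple single-instance lock: the pidfile holds the pid of the running copy,
# and a pidfile whose process is gone is treated as stale and removed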
def pid_exists(pid):
    # Signal 0 doesn't kill the process; it only checks whether the pid is alive
    try:
        os.kill(int(pid), 0)
        return True
    except OSError:
        return False
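# Saves a single file into the current subreddit's directory under a timestamped
# name; note that it reads the loop variable i as a global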
def image_down(url, ext):
    try:
        urllib.request.urlretrieve(url, download_dir + '/' + i + '/' + time.strftime('%Y_%m_%d_%H_%M_%S') + ext)
    except urllib.error.URLError:
        print("WARNING: There was an exception downloading from a direct link (likely a 403 or 404)\n")
# If the lock file exists we exit, unless the process that wrote it is dead
if os.path.isfile(pidfile):
    if pid_exists(open(pidfile).read()):
        print("{0} already exists, exiting".format(pidfile))
        sys.exit()
    else:
        print("Found stale lockfile, removing")
        os.unlink(pidfile)
with open(pidfile, mode='w') as f:
    f.write(pid)
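# subreddits.txt is expected to hold one subreddit name per line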
with open(SUBREDDITS) as f:
    lines = f.read().splitlines()
print(lines)
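# Appending .json to a subreddit URL makes reddit return the listing as JSON,
# which is what the regexes below scrape for links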
for i in lines:
    reddit_url = "https://reddit.com/r/{0}.json".format(i)
    # A custom User-Agent so reddit doesn't rate limit us as aggressively
    req = urllib.request.Request(reddit_url,
                                 headers={'User-Agent': "AbotbyATGUNAT"})
    print('Checking: {0}'.format(i))
    reddit_call = urllib.request.urlopen(req)
    result = reddit_call.read().decode('utf-8')
    # Matches imgur album links such as https://imgur.com/a/XXXXX (m.imgur.com included)
    results = re.findall('https?://m?.?imgur.com/a......', result)
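    # Album downloads are delegated to the external imgurdl tool, which has to be
    # on PATH; single images and gfycat links are handled inline further down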
    for image_url in results:
        os.makedirs('{0}/{1}'.format(download_dir, i), exist_ok=True)
        os.chdir('{0}/{1}'.format(download_dir, i))
        if image_url not in open(downloaded).read():
            os.system('imgurdl {0}'.format(image_url))
            imgur_albums_count = imgur_albums_count + 1
            # We write the downloaded url to a file so we can quickly skip already downloaded files
            f = open(downloaded, 'a')
            f.write(image_url + '\n')
            f.close()
    # We remove quotes so the urls aren't quoted
    result = result.replace('"', '')
    # This regex is from https://stackoverflow.com/questions/169625/regex-to-check-if-valid-url-that-ends-in-jpg-png-or-gif
    direct_links = re.findall(r'https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpg|gif|png|jpeg|gifv)', result)
    # This regex will get all links to gfycat
    gfycat_links = re.findall(r'https?://gfycat.com/\w*', result)
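    # The original gfycat page URL (not the rewritten mp4 URL) is what gets recorded
    # in downloaded.txt, so the server fallback below doesn't break deduplication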
    for gfy_links in gfycat_links:
        # We fetch the mp4 rather than the gif to save on bandwidth
        # The mp4 has to be requested from giant.gfycat.com instead of gfycat.com
        down_gfy_link = gfy_links.replace('gfycat.com', 'giant.gfycat.com')
        # This changes http to https
        down_gfy_link = down_gfy_link.replace('http://', 'https://')
        down_gfy_link = down_gfy_link + '.mp4'
        if gfy_links not in open(downloaded).read():
            print('Downloading\n {0} in {1}/{2} as {3} \n'.format(down_gfy_link, download_dir, i, time.strftime('%Y_%m_%d_%H_%M_%S')))
            try:
                urllib.request.urlretrieve(down_gfy_link, download_dir + '/' + i + '/' + time.strftime('%Y_%m_%d_%H_%M_%S') + '.mp4')
                gif_links_count = gif_links_count + 1
            except urllib.error.URLError:
                print("WARNING: There was an exception downloading from gfycat (likely a 403 or 404)\n")
                try:
                    # Fall back to the fat.gfycat.com server instead of giant.gfycat.com; for some
                    # reason we get a 403 if we ask the wrong one for the mp4
                    down_gfy_link = down_gfy_link.replace('giant', 'fat')
                    print('Trying fat.gfycat.com\n')
                    print('Downloading\n {0} in {1}/{2} as {3} \n'.format(down_gfy_link, download_dir, i, time.strftime('%Y_%m_%d_%H_%M_%S')))
                    urllib.request.urlretrieve(down_gfy_link, download_dir + '/' + i + '/' + time.strftime('%Y_%m_%d_%H_%M_%S') + '.mp4')
                    gif_links_count = gif_links_count + 1
                except urllib.error.URLError:
                    # Not coding all the backup servers right now because tracking them all down is a pain
                    print('fat.gfycat failed\n')
            print('Done!\n\n')
            f = open(downloaded, 'a')
            f.write(gfy_links + '\n')
            f.close()
    for d_image_url in direct_links:
        os.makedirs('{0}/{1}'.format(download_dir, i), exist_ok=True)
        # The redditmedia patterns stop us from downloading (some) thumbnails, and the
        # check against downloaded.txt is what keeps this from redownloading images
        # TODO clean this up.
        if (not re.findall('https?://i.redditmedia.com/............................................jpg', d_image_url)
                and not re.findall('https?://..thumbs.redditmedia.com/............................................jpg', d_image_url)
                and d_image_url not in open(downloaded).read()):
            print('Downloading\n {0} in {1}/{2} as {3}\n'.format(d_image_url, download_dir, i, time.strftime('%Y_%m_%d_%H_%M_%S')))
            if '.' in d_image_url[-4:] and not re.search(r'https?://i.imgur.com/\w*.gif', d_image_url):
                # Three-character extension (.jpg, .png, .gif) that isn't an imgur gif
                image_down(d_image_url, d_image_url[-4:])
                picture_links_count = picture_links_count + 1
            elif '.' in d_image_url[-4:] and re.search(r'https?://i.imgur.com/\w*.gif', d_image_url):
                # The gif is coming from imgur, so we fetch the mp4 instead to save on
                # bandwidth, then restore the original url for the dedup list
                d_image_url = d_image_url.replace('.gif', '.mp4')
                image_down(d_image_url, '.mp4')
                d_image_url = d_image_url.replace('.mp4', '.gif')
            elif '.' in d_image_url[-5:]:
                # Four-character extension such as .jpeg or .gifv
                image_down(d_image_url, d_image_url[-5:])
                picture_links_count = picture_links_count + 1
            print('Done\n')
            # We add downloaded urls to the list so we don't redownload them
            f = open(downloaded, 'a')
            f.write(d_image_url + '\n')
            f.close()
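# Each album directory that imgurdl produced is packed into a .cbz (a plain zip)
# comic-book archive, then the unpacked directory is removed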
for i in lines:
    if not os.path.isdir('{0}/{1}'.format(download_dir, i)):
        print('WARNING: {0}/{1} does not exist\nExiting'.format(download_dir, i))
        sys.exit()
    os.chdir('{0}/{1}'.format(download_dir, i))
    # We use system calls instead of a zip lib because of laziness
    # TODO change to zip lib so this can run on windows
    os.system('for i in */; do zip -r "${i%/}.cbz" "$i" -x *.cbr; done')
    os.system('rm -r */')
print('Done!')
total = picture_links_count + gif_links_count + imgur_albums_count
if args.l:
    date_finished = time.strftime("[%d/%m/%Y_%H:%M:%S]")
    with open(logfile, mode='a') as log:
        print(date_finished + '\nDownloaded:\n{0} Pictures\n{1} Gfys\n{2} Imgur Albums\n{3} Total items downloaded'.format(picture_links_count, gif_links_count, imgur_albums_count, total), file=log)
print('''Downloaded:
{0} Pictures
{1} Gfys
{2} Imgur Albums
{3} Total items downloaded'''.format(picture_links_count, gif_links_count, imgur_albums_count, total))
# We remove the pid file
os.unlink(pidfile)
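# Usage: python3 imgur_download.py [-l]
# -l additionally appends the summary above to ~/.config/imgur_down/log.txt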