forked from thewca/tnoodle
-
Notifications
You must be signed in to change notification settings - Fork 0
/
unifyhtml.py
81 lines (70 loc) · 2.46 KB
/
unifyhtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
import argparse
import urllib.request, urllib.parse, urllib.error
import urllib.parse
import os
import re
import sys
import time
def getRelativeUrl(baseUrl, url):
    """Resolve ``url`` against ``baseUrl`` and return an absolute URL string.

    Args:
        baseUrl: Absolute URL of the directory containing the page. Its path
            is prepended verbatim to relative paths, so it should end in '/'.
        url: The reference to resolve. It may be absolute (has a scheme),
            protocol-relative (``//host/path``), host-relative (``/path``)
            or relative to ``baseUrl``.

    Returns:
        The resolved URL, reassembled with ``urllib.parse.urlunparse``.
        Dot segments such as ``../`` are not collapsed.
    """
    parts = urllib.parse.urlparse(url)
    if parts.scheme:
        # Already absolute: nothing to borrow from the base.
        return urllib.parse.urlunparse(parts)

    base = urllib.parse.urlparse(baseUrl)
    if parts.netloc:
        # Protocol-relative reference: it names its own host, so only the
        # scheme comes from the base. Overwriting the host here would fetch
        # the resource from the wrong server.
        return urllib.parse.urlunparse(parts._replace(scheme=base.scheme))

    path = parts.path
    if not path.startswith('/'):
        # Relative path: anchor it in the base directory.
        path = base.path + path
    resolved = parts._replace(scheme=base.scheme, netloc=base.netloc, path=path)
    return urllib.parse.urlunparse(resolved)
def findAndInline(html, baseUrl, regex, template):
    """Replace every ``regex`` match in ``html`` with the inlined resource.

    Args:
        html: The document text to scan.
        baseUrl: Base URL used to resolve each captured reference.
        regex: Compiled pattern whose first group captures the resource URL.
        template: ``%``-format string taking two values: the URL exactly as
            it appears in ``html``, and the downloaded contents.

    Returns:
        ``html`` with each match replaced by the filled-in template. Text
        outside the matches is copied through unchanged.

    Raises:
        Any error raised by ``urllib.request.urlretrieve`` or ``open`` while
        fetching or reading a resource propagates to the caller.
    """
    pieces = []
    lastMatch = 0
    for match in regex.finditer(html):
        url = match.group(1)
        fullUrl = getRelativeUrl(baseUrl, url)
        filename, _headers = urllib.request.urlretrieve(fullUrl)
        # Use a context manager so the downloaded file's handle is closed
        # immediately instead of being left to the garbage collector.
        # NOTE(review): the file is decoded with the platform default
        # encoding, as before -- confirm whether UTF-8 should be forced.
        with open(filename) as downloaded:
            contents = downloaded.read()
        pieces.append(html[lastMatch:match.start()])
        pieces.append(template % (url, contents))
        lastMatch = match.end()
    # Tail of the document after the final match (or all of it if none).
    pieces.append(html[lastMatch:])
    return "".join(pieces)
def unify(url, try_count=1):
    """Return the unified HTML for ``url``, retrying on ``IOError``.

    Args:
        url: Address of the page to unify.
        try_count: Maximum number of attempts. If it is less than 1, no
            attempt is made and ``None`` is returned.

    Returns:
        Whatever ``unify_impl(url)`` returns on the first successful attempt.

    Raises:
        IOError: Re-raised from the final attempt, after a message has been
            written to stderr.
    """
    for attempt in range(1, try_count + 1):
        try:
            return unify_impl(url)
        except IOError:
            if attempt == try_count:
                # Out of attempts: report and let the error propagate.
                sys.stderr.write("Connecting to %s failed on attempt #%s, giving up\n" % ( url, try_count ))
                raise
        # Pause for a second before the next attempt.
        time.sleep(1)
def unify_impl(url):
    """Download the page at ``url`` and inline its stylesheets and scripts.

    Only tags that match these exact forms are inlined:
    ``<link type="text/css" rel="stylesheet" href="..." media="screen" />``
    and ``<script type="text/javascript" src="..."></script>``.
    Stylesheets are processed first, then scripts.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        The page's HTML with each matched tag replaced by a ``<style>`` or
        ``<script>`` element containing the referenced file's contents.

    Raises:
        Any error raised while downloading or reading the page or one of
        its resources propagates to the caller.
    """
    filename, _headers = urllib.request.urlretrieve(url)
    # Close the downloaded page promptly rather than leaking the handle.
    with open(filename) as page:
        ogHtml = page.read()

    # The base URL is the page URL with its last path segment dropped.
    # This is computed with string operations rather than os.path, which is
    # platform-specific and whose dirname("/index.html") + "/" gives "//"
    # for a page at the site root.
    pieces = list(urllib.parse.urlparse(url))
    pieces[2] = pieces[2].rpartition('/')[0].rstrip('/') + '/'  # path
    baseUrl = urllib.parse.urlunparse(pieces)

    internalCssTemplate = """<style type="text/css">
/************** %s ***************/
%s
</style>"""
    cssRe = re.compile('<link type="text/css" rel="stylesheet" href="([^"]*)" media="screen" />')
    ogHtml = findAndInline(ogHtml, baseUrl, cssRe, internalCssTemplate)

    internalJsTemplate = """<script type="text/javascript">
/************** %s ***************/
%s
</script>"""
    jsRe = re.compile('<script type="text/javascript" src="([^"]*)"></script>')
    ogHtml = findAndInline(ogHtml, baseUrl, jsRe, internalJsTemplate)

    return ogHtml
def main():
    """Command-line entry point: unify the given URL and print the result."""
    cli = argparse.ArgumentParser()
    cli.add_argument('url', help='Url to unify to a single html file')
    target = cli.parse_args().url
    unified = unify(target)
    print(unified)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()