"""
This file contains a simple program to store a list of static webpages,
and to fetch them to check for differences since the first save. It uses a
Linux-style .pagewatch directory in the user's home dir in order to save
the list of watched pages and the cached original versions.
"""
import sys
import os
import hashlib
import tempfile
import difflib
#boilerplate metadata block
__author__ = "Far McKon"
__copyright__ = "Copyright 2011, Far McKon Light Industries"
__license__ = "GPL"
__version__ = "1.0.1.0"
__maintainer__ = "Far McKon"
__email__ = "[email protected]"
pagewatch_list_filename = os.path.expanduser("~/.pagewatch/watchlist.txt")
pagewatch_old_data_dir = os.path.expanduser("~/.pagewatch/old_pages")
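# Watchlist format (one entry per line, as written by add_page and parsed by run_check):
#   <url> : <md5 of cached page> : <path to cached copy>
# Illustrative entry (values are made up):
#   http://example.com/page.html : d41d8cd98f00b204e9800998ecf8427e : /home/user/.pagewatch/old_pages/tmpAbC123
# Lines starting with '#' and blank lines are ignored.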
def dircheck_and_create(dirname):
    """ takes a directory name (NOT a filename), checks if it exists or if it needs
    to be created, and prompts the user for creation
    """
    if( not os.path.exists(dirname) ):
        var = raw_input("May we create " + dirname + " settings dir for you (yes/no)?")
        if(var == 'y' or var == 'yes' or var == 'Y' ):
            os.mkdir( dirname )
            return True
        elif(var == 'n' or var == 'no' or var == 'N' ):
            print "cannot continue. Goodbye"
            return False
        print "you entered ", var , "which I did not understand. Goodbye"
        return False
    print "dir exists"
    return True
def filecheck_and_create(filename):
    """ takes a file name and checks if it exists, or if it needs
    to be created, prompts the user for creation
    """
    if( not os.path.isfile(filename) ):
        var = raw_input("May we create " + filename + " file for you (yes/no)?")
        if(var == 'y' or var == 'yes' or var == 'Y' ):
            open(filename, 'w').close()
            return True
        elif(var == 'n' or var == 'no' or var == 'N' ):
            print "cannot continue. Goodbye"
            return False
        print "you entered ", var , "which I did not understand. Goodbye"
        return False
    return True
def wget(url, filename=None):
    """ Fetches a url wget-style. If filename is not specified, a tmpfile is created.
    Returns None on failure, or the filename used to store the data.
    """
    import urllib2
    try:
        opener1 = urllib2.build_opener()
        page1 = opener1.open(url)
        data = page1.read()
    except Exception:
        print "wget fail"
        return None
    tmp_filename = filename
    if(tmp_filename == None):
        tmp_filename = tempfile.mkstemp()[1] #tmpnam, but safe
        print tmp_filename # debug: show the generated tmp name
    fout = open(tmp_filename, "wb")
    fout.write(data)
    fout.close()
    return tmp_filename
def md5sum(fileName, excludeLine="", includeLine=""):
    """Compute the md5 hash of the specified file. excludeLine is a
    pattern used to discard any line that starts with that pattern;
    includeLine is concatenated to the end of the file's content before the md5 is computed.
    """
    #thanks to http://thejaswihr.blogspot.com/2008/06/python-md5-checksum-of-file.html
    # for this function
    m = hashlib.md5()
    try:
        fd = open(fileName,"rb")
    except IOError:
        print "Unable to open the file in read mode:", fileName
        return
    content = fd.readlines()
    fd.close()
    for eachLine in content:
        if excludeLine and eachLine.startswith(excludeLine):
            continue
        m.update(eachLine)
    m.update(includeLine)
    return m.hexdigest()
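# Illustrative call (the pattern and salt below are made up; this script only uses the default form):
#   md5sum("cached_page.html", excludeLine="<!-- generated", includeLine="salt")
# skips any line starting with "<!-- generated" and appends "salt" to the hashed content.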
def run_check():
    """ Checks the url watchlist for changes.
    Returns False if the checks could not be performed;
    otherwise returns a dict of HTML diffs keyed by url.
    An empty dict means the checks ran but no differences were found.
    """
    diffsDir = {}
    check_file = pagewatch_list_filename
    ok = dircheck_and_create( os.path.dirname(check_file) )
    if(ok == False):
        return False
    ok = filecheck_and_create(check_file)
    if(ok == False):
        return False
    #Load the watchlist txt file
    fh = open(check_file, 'r')
    lines = fh.readlines()
    for line in lines:
        if line[0] == '#' or line[0] == '\n': #comment or blank line
            continue
        (url, md5, old_ver_filename) = line.split(' : ') #split into 'url : md5 : filename', ' : ' delimited
        print 'url: ', url
        print 'md5: ', md5
        print 'old version file: ', old_ver_filename
        #wget that url to a tmp file
        tmp_filename = wget(url)
        if(tmp_filename == None):
            print "run_check failed to fetch " , url ," (tmp_filename " , tmp_filename ,")"
            return False
        #compare the new md5 against the md5 stored in the watchlist line
        md5_of_tmp = md5sum(tmp_filename)
        print "md5 of tmp: ", str(md5_of_tmp)
        if( str(md5_of_tmp) == str(md5) ):
            #print "md5 match!"
            continue
        else :
            #print "no md5 match!"
            old_fn = os.path.expanduser(old_ver_filename.strip()) #read from file, do expanduser
            old_fh = open(old_fn, 'rb')
            old_data = old_fh.readlines()
            new_fh = open(tmp_filename, 'rb')
            new_data = new_fh.readlines()
            differ = difflib.HtmlDiff()
            #build an HTML diff of the cached and freshly fetched versions
            #TODO: update the stored md5 for this url once a change has been reported
            diffsDir[url] = differ.make_file(old_data,new_data)
            #print diffsDir[url]
            print 'url: ', url
    #return the dictionary of diffs
    return diffsDir
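# Illustrative caller sketch (hypothetical report filenames; not invoked by this script):
#   diffs = run_check()
#   if diffs:
#       for url, html in diffs.items():
#           report_name = "diff_" + hashlib.md5(url).hexdigest() + ".html"
#           open(report_name, "w").write(html)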
#pagewatch_add
def add_page( url ):
    """ This function adds a page to the pagewatch list,
    and caches the baseline version of that page for future diffs
    """
    # wget the url
    print "getting url ", url
    tmp_filename = wget(url)
    if(tmp_filename == None):
        print "add_page failed to fetch " , url ," (tmp_filename " , tmp_filename ,")"
        return False
    md5 = md5sum(tmp_filename)
    print "content md5 ", md5
    #check if we can drop a cache file in there
    dircheck_and_create(pagewatch_old_data_dir)
    print "pagewatch_old_data_dir", pagewatch_old_data_dir
    cache_filename = tempfile.mkstemp(dir=pagewatch_old_data_dir)[1] #tmpnam, but safe
    print "cache file ", cache_filename
    #copy our page into our cache
    fh = open(cache_filename, "wb")
    fh.write( open(tmp_filename, 'rb').read() )
    fh.close()
    #append the new entry to the watchlist
    list_fh = open(pagewatch_list_filename, 'a')
    list_fh.write(" : ".join([url, md5, cache_filename]) )
    list_fh.write('\n')
    list_fh.close()
if __name__ == "__main__":
    """
    Standard main block, only runs if the module was called from the command line
    """
    print " Pagewatch to the rescue!"
    #print sys.argv
    #TODO add real options parsing in the future
    if( len(sys.argv) < 2):
        #print "looking for changes"
        diffsDir = run_check()
        if( diffsDir == False):
            print "Error in checking the pages available. "
            exit(-1)
        if( len(diffsDir) == 0) :
            print "No changes detected, nothing to report "
            exit(0)
        else:
            print "We found ", len(diffsDir), " changes in files"
    elif( len(sys.argv) >= 2):
        print "adding page" , sys.argv[1]
        add_page(sys.argv[1])
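# Example invocations (the URL is illustrative):
#   python page_watch.py http://example.com/index.html   -> add a page to the watchlist
#   python page_watch.py                                  -> check all watched pages for changes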