collectchanges.py
#!/usr/bin/python2
import os
import codecs
import uuid
from xml.sax.saxutils import escape

from BeautifulSoup import BeautifulSoup


def get_docs_path():
    """Collect the paths of every document.html under ./therev."""
    docs_path = []
    for root, dirs, files in os.walk('./therev'):
        for file in files:
            if file == 'document.html':
                docs_path.append(os.path.join(root, file))
    return docs_path


def extract_changes(file):
    """Concatenate the text of every node marked _DELETED_ in the document."""
    f = open(file)
    raw_file = f.read()
    f.close()
    soup = BeautifulSoup(raw_file)
    changes = ''
    for node in soup.findAll(type='_DELETED_'):
        if node.string:
            changes = changes + node.string
    return changes


def create_solr_add(changes, url):
    """Build one Solr <doc> element for the extracted changes.

    The document id is the URL resolved for the file, falling back to a
    random UUID when no URL can be found.
    """
    solr_doc = """
<doc>
<field name="id">%ID%</field>
<field name="changes">%CHANGES%</field>
</doc>
"""
    if not url:
        url = uuid.uuid1()
    # Escape XML special characters so the generated file stays well formed.
    solr_doc = solr_doc.replace("%CHANGES%", escape(changes))
    solr_doc = solr_doc.replace("%ID%", escape(str(url)))
    return solr_doc


master_file = open('./therev/master.html')
master_file_raw = master_file.read()
master_soup = BeautifulSoup(master_file_raw)
master_anchors = master_soup.findAll('a')
total_links = open('./links/total', 'r')
total_links_raw = total_links.readlines()


def get_file_url(f):
    """Map a document.html path back to its original URL via master.html
    and the ./links/total list."""
    for a in master_anchors:
        if a['href'] == os.path.abspath(f):
            href = a.parent.parent.findAll('a')[0]['href']
            f_name = os.path.basename(href)
            for line in total_links_raw:
                if f_name in line:
                    # Strip the trailing newline so the id field stays clean.
                    return line.strip()


def generate_solr_post():
    """Write a Solr XML update to solr_post.xml: one <add> element
    wrapping one <doc> per collected document.html."""
    solr_post = '<add>'
    for f in get_docs_path():
        changes = extract_changes(f)
        solr_doc = create_solr_add(changes, get_file_url(f))
        solr_post = solr_post + solr_doc
    solr_post = solr_post + '</add>\n'
    output = codecs.open('solr_post.xml', 'w', 'utf-8')
    output.write(solr_post)
    output.close()


generate_solr_post()
master_file.close()
total_links.close()
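
The generated solr_post.xml can then be sent to a Solr update handler with a plain HTTP POST. The snippet below is only a sketch: the host, port, and core name ("changes") are assumptions rather than anything defined in this repository, and it uses urllib2 to stay consistent with the Python 2 script above.

#!/usr/bin/python2
# Hypothetical usage sketch: POST solr_post.xml to a Solr update handler.
# SOLR_UPDATE_URL (host, port, core name) is an assumption; adjust to your setup.
import urllib2

SOLR_UPDATE_URL = 'http://localhost:8983/solr/changes/update?commit=true'

with open('solr_post.xml', 'rb') as f:
    payload = f.read()

request = urllib2.Request(SOLR_UPDATE_URL, data=payload,
                          headers={'Content-Type': 'text/xml; charset=utf-8'})
response = urllib2.urlopen(request)
print response.read()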