This repository has been archived by the owner on Mar 16, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathepub-xml-fix-ncx-id.py
67 lines (51 loc) · 2.56 KB
/
epub-xml-fix-ncx-id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
from lxml import etree
# Synchronise the NCX "dtb:uid" with the OPF <dc:identifier> for every EPUB
# found below the current working directory.
#
# For each folder that holds both an .opf and an .ncx file, the identifier is
# read from the OPF metadata and copied into the "content" attribute of the
# NCX <meta name="dtb:uid"> element.  The original NCX is left untouched; the
# result is written next to it with a "-new" suffix.

# Fully-qualified (Clark notation) tag names used to navigate both documents.
OPF_METADATA_TAG = "{http://www.idpf.org/2007/opf}metadata"
DC_IDENTIFIER_TAG = "{http://purl.org/dc/elements/1.1/}identifier"
NCX_HEAD_TAG = "{http://www.daisy.org/z3986/2005/ncx/}head"
NCX_UID_META_PATH = "{http://www.daisy.org/z3986/2005/ncx/}meta[@name='dtb:uid']"

# Start from the current working directory, i.e. the folder the script is
# launched from (not necessarily the folder the script file lives in).
current_folder = os.getcwd()

# Walk the whole folder tree under the current folder and stop at each node.
for root_folder, sub_folders, file_names in os.walk(current_folder):
    print("Search for OPF files in %s ..." % root_folder)
    opf_file_name = None
    ncx_file_name = None

    # Search for both the OPF file and its NCX companion in the same folder.
    # When a folder holds several candidates, the last one listed wins.
    for file_name in file_names:
        if file_name.endswith('.opf'):
            print("OPF file found: %s" % file_name)
            opf_file_name = file_name
        if file_name.endswith('.ncx'):
            print("NCX file found: %s" % file_name)
            ncx_file_name = file_name

    # Only touch XML content when both an OPF *and* an NCX file are present.
    if not (opf_file_name and ncx_file_name):
        continue

    # Re-build the full paths locating the XML files.
    opf_file_path = os.path.join(root_folder, opf_file_name)
    ncx_file_path = os.path.join(root_folder, ncx_file_name)
    print("Full OPF path: %s" % opf_file_path)
    print("Full NCX path: %s" % ncx_file_path)

    # Parse the OPF file.  Binary mode lets the XML parser honour the
    # encoding declared in the document, and the "with" block guarantees
    # the handle is closed.
    with open(opf_file_path, 'rb') as opf_file:
        opf_tree = etree.parse(opf_file)

    # Browse the XML tree, searching for the identifier we care about.
    package = opf_tree.getroot()
    metadata = package.find(OPF_METADATA_TAG)
    identifier = None
    if metadata is not None:
        identifier = metadata.find(DC_IDENTIFIER_TAG)
    if identifier is None or identifier.text is None:
        # A malformed book must not abort the processing of the others.
        print("No identifier found in %s, skipping." % opf_file_path)
        continue
    identifier_value = identifier.text
    print("OPF identifier is: %s" % identifier_value)

    # Now open and parse the NCX file.
    with open(ncx_file_path, 'rb') as ncx_file:
        ncx_tree = etree.parse(ncx_file)

    # Browse the XML tree, searching for the <meta name="dtb:uid"> element.
    ncx = ncx_tree.getroot()
    head = ncx.find(NCX_HEAD_TAG)
    uid_meta = None
    if head is not None:
        uid_meta = head.find(NCX_UID_META_PATH)
    if uid_meta is None:
        print("No dtb:uid meta found in %s, skipping." % ncx_file_path)
        continue

    # Replace NCX file's meta content by the one we extracted from OPF.
    uid_meta.set("content", identifier_value)

    # Render the modified XML tree object into an encoded XML byte string.
    xml_string = etree.tostring(ncx_tree, xml_declaration=True, encoding='utf-8')

    # Save the resulting modification into a new file.  The data is already
    # encoded, so it is written in binary mode.
    new_ncx_file_path = ncx_file_path + '-new'
    with open(new_ncx_file_path, 'wb') as new_ncx_file:
        new_ncx_file.write(xml_string)