diff --git a/CHANGES.md b/CHANGES.md
index 6d62d770..4bc9e41b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -8,6 +8,7 @@
- [pull #532] Fix #493 persisting when `code-friendly` extra enabled
- [pull #535] Update `_slugify` to use utf-8 encoding (issue #534)
- [pull #536] Maintain order of appearance in footnotes
+- [pull #538] Include HTML headers in TOC
## python-markdown2 2.4.10
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 48c2b281..479663b0 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -240,6 +240,13 @@ def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
else:
self._toc_depth = self.extras["toc"].get("depth", 6)
+ if 'header-ids' in self.extras:
+ if not isinstance(self.extras['header-ids'], dict):
+ self.extras['header-ids'] = {
+ 'mixed': False,
+ 'prefix': self.extras['header-ids']
+ }
+
if 'break-on-newline' in self.extras:
self.extras.setdefault('breaks', {})
self.extras['breaks']['on_newline'] = True
@@ -424,6 +431,17 @@ def convert(self, text):
text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)
if "toc" in self.extras and self._toc:
+ if self.extras['header-ids'].get('mixed'):
+ # TOC will only be out of order if mixed headers is enabled
+ def toc_sort(entry):
+ '''Sort the TOC by order of appearance in text'''
+ return re.search(
+ # header tag, any attrs, the ID, any attrs, the text, close tag
+ r'^<(h%d).*?id=(["\'])%s\2.*>%s\1>$' % (entry[0], entry[1], re.escape(entry[2])),
+ text, re.M
+ ).start()
+
+ self._toc.sort(key=toc_sort)
self._toc_html = calculate_toc_html(self._toc)
# Prepend toc html to output
@@ -783,6 +801,8 @@ def _hash_html_block_sub(self, match, raw=False):
return ''.join(["\n\n", f_key,
"\n\n", middle, "\n\n",
l_key, "\n\n"])
+ elif self.extras.get('header-ids', {}).get('mixed') and self._h_tag_re.match(html):
+ html = self._h_tag_re.sub(self._h_tag_sub, html)
key = _hash_text(html)
self.html_blocks[key] = html
return "\n\n" + key + "\n\n"
@@ -1786,6 +1806,13 @@ def header_id_from_text(self, text, prefix, n):
return header_id
+ def _header_id_exists(self, text):
+ header_id = _slugify(text)
+ prefix = self.extras['header-ids'].get('prefix')
+ if prefix and isinstance(prefix, str):
+ header_id = prefix + '-' + header_id
+ return header_id in self._count_from_header_id
+
def _toc_add_entry(self, level, id, name):
if level > self._toc_depth:
return
@@ -1810,6 +1837,7 @@ def _toc_add_entry(self, level, id, name):
_h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
def _h_sub(self, match):
+ '''Handles processing markdown headers'''
if match.group(1) is not None and match.group(3) == "-":
return match.group(1)
elif match.group(1) is not None:
@@ -1827,7 +1855,7 @@ def _h_sub(self, match):
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(header_group,
- self.extras["header-ids"], n)
+ self.extras["header-ids"].get('prefix'), n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(header_group)
@@ -1835,6 +1863,37 @@ def _h_sub(self, match):
self._toc_add_entry(n, header_id, html)
return "%s\n\n" % (n, header_id_attr, html, n)
+ _h_tag_re = re.compile(r'''
+ ^ # \1 tag num, \2 attrs
+ (.*) # \3 text
+
+ ''', re.X | re.M)
+
+ def _h_tag_sub(self, match):
+ '''Different to `_h_sub` in that this function handles existing HTML headers'''
+ text = match.string[match.start(): match.end()]
+ h_level = int(match.group(1))
+ # extract id= attr from tag, trying to account for regex "misses"
+ id_attr = (re.match(r'.*?id=(\S+)?.*', match.group(2) or '') or '')
+ if id_attr:
+ # if id attr exists, extract that
+ id_attr = id_attr.group(1) or ''
+ id_attr = id_attr.strip('\'" ')
+ h_text = match.group(3)
+
+ # check if header was already processed (ie: was a markdown header rather than HTML)
+ if id_attr and self._header_id_exists(id_attr):
+ return text
+
+ # generate new header id if none existed
+ header_id = id_attr or self.header_id_from_text(h_text, self.extras['header-ids'].get('prefix'), h_level)
+ if "toc" in self.extras:
+ self._toc_add_entry(h_level, header_id, h_text)
+ if header_id and not id_attr:
+ # 'Header 1
+
+
+
+
+
+
+
+
+
+Header 6
diff --git a/test/tm-cases/mixed_header_ids.opts b/test/tm-cases/mixed_header_ids.opts
new file mode 100644
index 00000000..6ee80acb
--- /dev/null
+++ b/test/tm-cases/mixed_header_ids.opts
@@ -0,0 +1 @@
+{"extras": {"header-ids": {"mixed": True}, "toc": None}}
diff --git a/test/tm-cases/mixed_header_ids.text b/test/tm-cases/mixed_header_ids.text
new file mode 100644
index 00000000..49bb9474
--- /dev/null
+++ b/test/tm-cases/mixed_header_ids.text
@@ -0,0 +1,11 @@
+# Header 1
+
+Header 2
+
+# Header 3
+
+Header 4
+
+# Header 5
+
+Header 6
diff --git a/test/tm-cases/mixed_header_ids.toc_html b/test/tm-cases/mixed_header_ids.toc_html
new file mode 100644
index 00000000..cd43e647
--- /dev/null
+++ b/test/tm-cases/mixed_header_ids.toc_html
@@ -0,0 +1,14 @@
+