diff --git a/CHANGES.md b/CHANGES.md index 6d62d770..4bc9e41b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,6 +8,7 @@ - [pull #532] Fix #493 persisting when `code-friendly` extra enabled - [pull #535] Update `_slugify` to use utf-8 encoding (issue #534) - [pull #536] Maintain order of appearance in footnotes +- [pull #538] Include HTML headers in TOC ## python-markdown2 2.4.10 diff --git a/lib/markdown2.py b/lib/markdown2.py index 48c2b281..479663b0 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -240,6 +240,13 @@ def __init__(self, html4tags=False, tab_width=4, safe_mode=None, else: self._toc_depth = self.extras["toc"].get("depth", 6) + if 'header-ids' in self.extras: + if not isinstance(self.extras['header-ids'], dict): + self.extras['header-ids'] = { + 'mixed': False, + 'prefix': self.extras['header-ids'] + } + if 'break-on-newline' in self.extras: self.extras.setdefault('breaks', {}) self.extras['breaks']['on_newline'] = True @@ -424,6 +431,17 @@ def convert(self, text): text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text) if "toc" in self.extras and self._toc: + if self.extras['header-ids'].get('mixed'): + # TOC will only be out of order if mixed headers is enabled + def toc_sort(entry): + '''Sort the TOC by order of appearance in text''' + return re.search( + # header tag, any attrs, the ID, any attrs, the text, close tag + r'^<(h%d).*?id=(["\'])%s\2.*>%s$' % (entry[0], entry[1], re.escape(entry[2])), + text, re.M + ).start() + + self._toc.sort(key=toc_sort) self._toc_html = calculate_toc_html(self._toc) # Prepend toc html to output @@ -783,6 +801,8 @@ def _hash_html_block_sub(self, match, raw=False): return ''.join(["\n\n", f_key, "\n\n", middle, "\n\n", l_key, "\n\n"]) + elif self.extras.get('header-ids', {}).get('mixed') and self._h_tag_re.match(html): + html = self._h_tag_re.sub(self._h_tag_sub, html) key = _hash_text(html) self.html_blocks[key] = html return "\n\n" + key + "\n\n" @@ -1786,6 +1806,13 @@ def header_id_from_text(self, text, prefix, n): return header_id + def _header_id_exists(self, text): + header_id = _slugify(text) + prefix = self.extras['header-ids'].get('prefix') + if prefix and isinstance(prefix, str): + header_id = prefix + '-' + header_id + return header_id in self._count_from_header_id + def _toc_add_entry(self, level, id, name): if level > self._toc_depth: return @@ -1810,6 +1837,7 @@ def _toc_add_entry(self, level, id, name): _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M) def _h_sub(self, match): + '''Handles processing markdown headers''' if match.group(1) is not None and match.group(3) == "-": return match.group(1) elif match.group(1) is not None: @@ -1827,7 +1855,7 @@ def _h_sub(self, match): header_id_attr = "" if "header-ids" in self.extras: header_id = self.header_id_from_text(header_group, - self.extras["header-ids"], n) + self.extras["header-ids"].get('prefix'), n) if header_id: header_id_attr = ' id="%s"' % header_id html = self._run_span_gamut(header_group) @@ -1835,6 +1863,37 @@ def _h_sub(self, match): self._toc_add_entry(n, header_id, html) return "%s\n\n" % (n, header_id_attr, html, n) + _h_tag_re = re.compile(r''' + ^ # \1 tag num, \2 attrs + (.*) # \3 text + + ''', re.X | re.M) + + def _h_tag_sub(self, match): + '''Different to `_h_sub` in that this function handles existing HTML headers''' + text = match.string[match.start(): match.end()] + h_level = int(match.group(1)) + # extract id= attr from tag, trying to account for regex "misses" + id_attr = (re.match(r'.*?id=(\S+)?.*', match.group(2) or '') or '') + if id_attr: + # if id attr exists, extract that + id_attr = id_attr.group(1) or '' + id_attr = id_attr.strip('\'" ') + h_text = match.group(3) + + # check if header was already processed (ie: was a markdown header rather than HTML) + if id_attr and self._header_id_exists(id_attr): + return text + + # generate new header id if none existed + header_id = id_attr or self.header_id_from_text(h_text, self.extras['header-ids'].get('prefix'), h_level) + if "toc" in self.extras: + self._toc_add_entry(h_level, header_id, h_text) + if header_id and not id_attr: + # 'Header 1 + +

Header 2

+ +

Header 3

+ +

Header 4

+ +

Header 5

+ +
Header 6
diff --git a/test/tm-cases/mixed_header_ids.opts b/test/tm-cases/mixed_header_ids.opts new file mode 100644 index 00000000..6ee80acb --- /dev/null +++ b/test/tm-cases/mixed_header_ids.opts @@ -0,0 +1 @@ +{"extras": {"header-ids": {"mixed": True}, "toc": None}} diff --git a/test/tm-cases/mixed_header_ids.text b/test/tm-cases/mixed_header_ids.text new file mode 100644 index 00000000..49bb9474 --- /dev/null +++ b/test/tm-cases/mixed_header_ids.text @@ -0,0 +1,11 @@ +# Header 1 + +

Header 2

+ +# Header 3 + +

Header 4

+ +# Header 5 + +
Header 6
diff --git a/test/tm-cases/mixed_header_ids.toc_html b/test/tm-cases/mixed_header_ids.toc_html new file mode 100644 index 00000000..cd43e647 --- /dev/null +++ b/test/tm-cases/mixed_header_ids.toc_html @@ -0,0 +1,14 @@ +