From d01c5b71f5729c8868b272b37367dcc19c854a2f Mon Sep 17 00:00:00 2001 From: Agathe Porte Date: Fri, 17 Nov 2023 17:19:52 +0100 Subject: [PATCH] Introduce SUMMARY_MAX_PARAGRAPHS Let the users use only the n-first paragraphs of the article as article summary. The advantage of this approach is that we avoid the random word count ellipsis that will cut content in pieces, while not having to copy the first paragraph of the article into the article's summary metadata. If both SUMMARY_MAX_PARAGRAPHS and SUMMARY_MAX_LENGTH are set, then the SUMMARY_MAX_LENGTH option will apply to the number of paragraphs in SUMMARY_MAX_PARAGRAPHS. --- RELEASE.md | 3 +++ docs/content.rst | 5 ++++- docs/settings.rst | 8 ++++++++ pelican/contents.py | 12 +++++++++++- pelican/tests/test_contents.py | 25 +++++++++++++++++++++++++ pelican/tests/test_utils.py | 17 +++++++++++++++++ pelican/utils.py | 19 +++++++++++++++++++ 7 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 RELEASE.md diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000000..c4272ec5b0 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,3 @@ +Release type: minor + +Allow users to use n-first paragraphs as article summary. diff --git a/docs/content.rst b/docs/content.rst index cacacea9aa..dcb9930968 100644 --- a/docs/content.rst +++ b/docs/content.rst @@ -162,7 +162,10 @@ author you can use ``author`` field. If you do not explicitly specify summary metadata for a given post, the ``SUMMARY_MAX_LENGTH`` setting can be used to specify how many words from the -beginning of an article are used as the summary. +beginning of an article are used as the summary. You can also use the first N +paragraphs from the post as a summary using the ``SUMMARY_MAX_PARAGRAPHS`` +setting. If both options are in use, the specified number of paragraphs will +be used but may be truncated to respect the specified max length. 
You can also extract any metadata from the filename through a regular expression to be set in the ``FILENAME_METADATA`` setting. All named groups diff --git a/docs/settings.rst b/docs/settings.rst index e9edffde0e..d0c14d043a 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -308,6 +308,14 @@ Basic settings does not otherwise specify a summary. Setting to ``None`` will cause the summary to be a copy of the original content. +.. data:: SUMMARY_MAX_PARAGRAPHS = 1 + + When creating a short summary of an article, this will be the number of + paragraphs to use as the summary. This only applies if your content + does not otherwise specify a summary. Setting to ``None`` will cause the + summary to use the whole text (up to ``SUMMARY_MAX_LENGTH``) instead of just + the first N paragraphs. + .. data:: SUMMARY_END_SUFFIX = '…' When creating a short summary of an article and the result was truncated to diff --git a/pelican/contents.py b/pelican/contents.py index 474e5bbfea..51e89123af 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -24,6 +24,7 @@ sanitised_join, set_date_tzinfo, slugify, + truncate_html_paragraphs, truncate_html_words, ) @@ -431,8 +432,17 @@ def get_summary(self, siteurl): if "summary" in self.metadata: return self.metadata["summary"] + content = self.content + if ( + "SUMMARY_MAX_PARAGRAPHS" in self.settings + and self.settings["SUMMARY_MAX_PARAGRAPHS"] is not None + ): + content = truncate_html_paragraphs( + self.content, self.settings["SUMMARY_MAX_PARAGRAPHS"] + ) + if self.settings["SUMMARY_MAX_LENGTH"] is None: - return self.content + return content return truncate_html_words( self.content, diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 9dc7b70d71..96890fecca 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -117,6 +117,31 @@ def test_summary_max_length(self): page = Page(**page_kwargs) self.assertEqual(page.summary, "") + def test_summary_paragraph(self): + # If 
a :SUMMARY_MAX_PARAGRAPHS: is set, the generated summary should + # not exceed the given paragraph count. + page_kwargs = self._copy_page_kwargs() + settings = get_settings() + page_kwargs["settings"] = settings + del page_kwargs["metadata"]["summary"] + settings["SUMMARY_MAX_PARAGRAPHS"] = 1 + settings["SUMMARY_MAX_LENGTH"] = None + page = Page(**page_kwargs) + self.assertEqual(page.summary, TEST_CONTENT) + + def test_summary_paragraph_max_length(self): + # If a :SUMMARY_MAX_PARAGRAPHS: and :SUMMARY_MAX_LENGTH: are set, the + # generated summary should not exceed the given paragraph count and + # not exceed the given length. + page_kwargs = self._copy_page_kwargs() + settings = get_settings() + page_kwargs["settings"] = settings + del page_kwargs["metadata"]["summary"] + settings["SUMMARY_MAX_PARAGRAPHS"] = 1 + settings["SUMMARY_MAX_LENGTH"] = 10 + page = Page(**page_kwargs) + self.assertEqual(page.summary, truncate_html_words(TEST_CONTENT, 10)) + def test_summary_end_suffix(self): # If a :SUMMARY_END_SUFFIX: is set, and there is no other summary, # generated summary should contain the specified marker at the end. diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 22dd8e38a5..1ffe44407c 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -401,6 +401,23 @@ def test_truncate_html_words(self): self.assertEqual(utils.truncate_html_words("Ӓ text", 20), "Ӓ text") self.assertEqual(utils.truncate_html_words("઼ text", 20), "઼ text") + def test_truncate_html_paragraphs(self): + one = "

<p>one</p>

" + + self.assertEqual(utils.truncate_html_paragraphs(one, 0), "") + self.assertEqual(utils.truncate_html_paragraphs(one, 1), one) + self.assertEqual(utils.truncate_html_paragraphs(one, 2), one) + + two = one + "

<p>two</p>

" + self.assertEqual(utils.truncate_html_paragraphs(two, 1), one) + self.assertEqual(utils.truncate_html_paragraphs(two, 2), two) + + three = two + "

<p>three</p>

" + self.assertEqual(utils.truncate_html_paragraphs(three, 1), one) + self.assertEqual(utils.truncate_html_paragraphs(three, 2), two) + self.assertEqual(utils.truncate_html_paragraphs(three, 3), three) + self.assertEqual(utils.truncate_html_paragraphs(three, 4), three) + def test_process_translations(self): fr_articles = [] en_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index eda53d3f54..dfe05d216a 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -600,6 +600,25 @@ def truncate_html_words(s, num, end_text="…"): return out +def truncate_html_paragraphs(s, count): + """Truncates HTML to a certain number of paragraphs. + + :param count: number of paragraphs to keep + + Newlines in the HTML are preserved. + """ + paragraphs = [] + tag_stop = 0 + substr = s[:] + for i in range(count): + substr = substr[tag_stop:] + tag_start = substr.find("

<p>") + tag_stop = substr.find("</p>") + len("</p>

") + paragraphs.append(substr[tag_start:tag_stop]) + + return "".join(paragraphs) + + def process_translations(content_list, translation_id=None): """Finds translations and returns them.