From 4278f7b7e0b64293646ac13d0c5dceb946927c5c Mon Sep 17 00:00:00 2001
From: Kesara Rathnayake <kesara@fq.nz>
Date: Thu, 22 Feb 2024 09:42:59 +1300
Subject: [PATCH] fix(text): Don't break URLs in annotation (#1104)

* refactor(text): Define MAX_WIDTH

* refactor(text): Define SPLITTER_WIDTH

* fix(text): Don't break URLs in annotation

Fixes #1101

* test: Add unit test for annotations

Add unit tests for annotations in xml2rfc.TextWriter.render_reference
method.
---
 test.py                 | 62 +++++++++++++++++++++++++++++++++++++++++
 xml2rfc/writers/text.py | 24 +++++++++-------
 2 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/test.py b/test.py
index 2d1afc00..f486424a 100644
--- a/test.py
+++ b/test.py
@@ -11,6 +11,7 @@
 from xml2rfc.boilerplate_rfc_7841 import boilerplate_rfc_status_of_memo
 from xml2rfc.walkpdf import xmldoc
 from xml2rfc.writers.base import default_options
+from xml2rfc.writers.text import MAX_WIDTH
 
 try:
     from xml2rfc import debug
@@ -635,6 +636,67 @@ def test_boilerplate_insert_status_of_memo_editorial(self):
         target = 'https://www.rfc-editor.org/info/rfc9280'
         self.assertEqual(target, rfc.xpath('./section/t/eref')[0].get('target'))
 
+class TextWriterTest(unittest.TestCase):
+    '''TextWriter tests'''
+
+    def setUp(self):
+        xml2rfc.log.quiet = True
+        path = 'tests/input/elements.xml'
+        self.parser = xml2rfc.XmlRfcParser(path,
+                                           quiet=True,
+                                           options=default_options,
+                                           **options_for_xmlrfcparser)
+        self.xmlrfc = self.parser.parse()
+        self.writer = xml2rfc.TextWriter(self.xmlrfc, quiet=True)
+
+    def test_render_reference(self):
+        # test annotations
+        reference = '''
+<references>
+  <reference anchor="REFTEST">
+    <front>
+      <title>Reference Test</title>
+      <author initials="J." surname="Doe" fullname="Jane Doe">
+        <organization>ACMI Corp.</organization>
+      </author>
+      <date month="February" year="2024"/>
+    </front>
+    <annotation>{annotation}</annotation>
+  </reference>
+</references>'''
+        self.writer.refname_mapping['REFTEST'] = 'REFTEST'
+
+        # single line annotation
+        annotation = 'foobar'
+        references = lxml.etree.fromstring(reference.format(annotation=annotation))
+        lines = self.writer.render_reference(references.getchildren()[0], width=60)
+        self.assertEqual(len(lines), 1)
+        self.assertIn(annotation, lines[0].text)
+
+        # multi line annotation
+        annotation = '''Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa.
+               Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim.'''
+        references = lxml.etree.fromstring(reference.format(annotation=annotation))
+        lines = self.writer.render_reference(references.getchildren()[0], width=60)
+        self.assertGreater(len(lines), 1)
+        self.assertIn(annotation[:5], lines[0].text)
+
+        # single line annotation (larger than width and smaller than MAX_WIDTH)
+        annotation = 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commo'
+        references = lxml.etree.fromstring(reference.format(annotation=annotation))
+        lines = self.writer.render_reference(references.getchildren()[0], width=len(annotation)-5)
+        self.assertGreater(MAX_WIDTH, len(annotation))
+        self.assertGreater(len(lines), 1)
+        self.assertIn(annotation[:5], lines[0].text)
+
+        # annotation with URL
+        url = 'https://example.org'
+        annotation = f'Example: <eref target="{url}" />'
+        references = lxml.etree.fromstring(reference.format(annotation=annotation))
+        lines = self.writer.render_reference(references.getchildren()[0], width=60)
+        self.assertEqual(len(lines), 2)
+        self.assertIn(url, lines[1].text)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/xml2rfc/writers/text.py b/xml2rfc/writers/text.py
index 3d6fb45e..f83d9633 100644
--- a/xml2rfc/writers/text.py
+++ b/xml2rfc/writers/text.py
@@ -36,6 +36,9 @@
 from xml2rfc.utils import justify_inline, clean_text
 
 
+MAX_WIDTH = 72
+SPLITTER_WIDTH = 67
+
 IndexItem   = namedtuple('indexitem', ['item', 'subitem', 'anchor', 'page', ])
 Joiner      = namedtuple('joiner', ['join', 'indent', 'hang', 'overlap', 'do_outdent'])
 # Joiner parts:
@@ -80,8 +83,8 @@ def __init__(self, elem, prev, next=None, beg=None, end=None):
         self.beg  = beg                 # beginning line of block
         self.end  = end                 # ending line of block
 
-wrapper = utils.TextWrapper(width=72)
-splitter = utils.TextSplitter(width=67)
+wrapper = utils.TextWrapper(width=MAX_WIDTH)
+splitter = utils.TextSplitter(width=SPLITTER_WIDTH)
 seen = set()
 
 # This is not a complete list of whitespace characters, and isn't intended to be.  It's
@@ -299,7 +302,7 @@ def process(self):
             joiners = base_joiners
             if self.options.pagination:
                 self.add_pageno_placeholders()
-            lines = self.render(self.root, width=72, joiners=joiners)
+            lines = self.render(self.root, width=MAX_WIDTH, joiners=joiners)
 
             if self.options.pagination:
                 lines = findblocks(lines)
@@ -321,8 +324,8 @@ def process(self):
                             sys.stderr.write(("%3d %10s         [%4s] %s\n" % (i, tag,                           page, l.text)))
             for i, l in enumerate(lines):
                 length = len(l.text)
-                if length > 72:
-                    self.warn(l.elem, "Too long line found (L%s), %s characters longer than 72 characters: \n%s" %(i+1, length-72, l.text))
+                if length > MAX_WIDTH:
+                    self.warn(l.elem, f"Too long line found (L{i + 1}), {length - MAX_WIDTH} characters longer than {MAX_WIDTH} characters: \n{l.text}")
 
             text = ('\n'.join( l.text for l in lines )).rstrip(stripspace) + '\n'
 
@@ -505,7 +508,7 @@ def update_toc(self, lines):
             else:
                 pass
         # new toc, to be used to replace the old one
-        toclines = self.render(toc, width=72, joiners=base_joiners)
+        toclines = self.render(toc, width=MAX_WIDTH, joiners=base_joiners)
         if toc_start and toc_end:
             j = 2
             for i in range(toc_start+2, toc_end):
@@ -1811,7 +1814,7 @@ def join_cols(left, right):
                 textwidth_l = textwidth(l)
                 textwidth_r = textwidth(r)
                 #assert textwidth_l+textwidth_r< 70
-                w = 72-textwidth_l-textwidth_r
+                w = MAX_WIDTH-textwidth_l-textwidth_r
                 lines.append(l+' '*w+r)
             return '\n'.join(lines).rstrip(stripspace)+'\n'
         #
@@ -1819,7 +1822,7 @@ def wrap(label, items, left, right, suffix=''):
             line = '%s%s%s' % (label, items, suffix)
             ll = len(left)
             lr = len(right)
-            width = 48 if ll >= lr else min(48, 72-4-len(right[ll]))
+            width = 48 if ll >= lr else min(48, MAX_WIDTH-4-len(right[ll]))
             wrapper = textwrap.TextWrapper(width=width, subsequent_indent=' '*len(label))
             return wrapper.wrap(line)
         #
@@ -2746,7 +2749,8 @@ def render_reference(self, e, width, **kwargs):
         text += '.'
         for ctag in ('annotation', ):
             for c in e.iterdescendants(ctag):
-                text = self.tjoin(text, c, width, **kwargs)
+                # use MAX_WIDTH here since text gets formatted later
+                text = self.tjoin(text, c, MAX_WIDTH, keep_url=True, **kwargs)
         text = fill(text, width=width, fix_sentence_endings=False, keep_url=True, **kwargs).lstrip(stripspace)
         
         text = indent(text, 11, 0)
@@ -3957,7 +3961,7 @@ def find_minwidths(e, cells, hyphen_split=False):
             Find the minimum column widths of regular cells
             """
             i = 0
-            splitter = utils.TextSplitter(width=67, hyphen_split=hyphen_split)
+            splitter = utils.TextSplitter(width=SPLITTER_WIDTH, hyphen_split=hyphen_split)
             for p in e.iterchildren(['thead', 'tbody', 'tfoot']):
                 for r in list(p.iterchildren('tr')):
                     j = 0