xmlwriter: fix issue with control characters in data elements

Issue #978
jmcnamara · Nov 4, 2023 · f5142fb · f5142fb
1 parent 0f10fb7
commit f5142fb
Show file tree

Hide file tree

Showing 5 changed files with 85 additions and 39 deletions.
diff --git a/xlsxwriter/sharedstrings.py b/xlsxwriter/sharedstrings.py
@@ -6,17 +6,10 @@
 # Copyright 2013-2023, John McNamara, [email protected]
 #
 
-# Standard packages.
-import re
-
 # Package imports.
 from . import xmlwriter
 from .utility import preserve_whitespace
 
-# Compile performance critical regular expressions.
-re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
-re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")
-
 
 class SharedStrings(xmlwriter.XMLwriter):
     """
@@ -92,22 +85,8 @@ def _write_si(self, string):
         # Write the <si> element.
         attributes = []
 
-        # Excel escapes control characters with _xHHHH_ and also escapes any
-        # literal strings of that type by encoding the leading underscore.
-        # So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
-        # The following substitutions deal with those cases.
-
-        # Escape the escape.
-        string = re_control_chars_1.sub(r"_x005F\1", string)
-
-        # Convert control character to the _xHHHH_ escape.
-        string = re_control_chars_2.sub(
-            lambda match: "_x%04X_" % ord(match.group(1)), string
-        )
-
-        # Escapes non characters in strings.
-        string = string.replace("\uFFFE", "_xFFFE_")
-        string = string.replace("\uFFFF", "_xFFFF_")
+        # Convert control character to a _xHHHH_ escape.
+        string = self._escape_control_characters(string)
 
         # Add attribute to preserve leading or trailing whitespace.
         if preserve_whitespace(string):

diff --git a/xlsxwriter/test/comparison/test_escapes09.py b/xlsxwriter/test/comparison/test_escapes09.py
@@ -0,0 +1,50 @@
+###############################################################################
+#
+# Tests for XlsxWriter.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+# Copyright (c), 2013-2023, John McNamara, [email protected]
+#
+
+from ..excel_comparison_test import ExcelComparisonTest
+from ...workbook import Workbook
+
+
+class TestCompareXLSXFiles(ExcelComparisonTest):
+    """
+    Test file created by XlsxWriter against a file created by Excel.
+
+    """
+
+    def setUp(self):
+        self.set_filename("escapes09.xlsx")
+
+    def test_create_file(self):
+        """Test the creation of a simple XlsxWriter file."""
+
+        workbook = Workbook(self.got_filename)
+
+        worksheet = workbook.add_worksheet()
+        chart = workbook.add_chart({"type": "line"})
+
+        chart.axis_ids = [52721920, 53133312]
+
+        worksheet.write(0, 0, "Data\x1b[32m1")
+        worksheet.write(1, 0, "Data\x1b[32m2")
+        worksheet.write(2, 0, "Data\x1b[32m3")
+        worksheet.write(3, 0, "Data\x1b[32m4")
+
+        worksheet.write(0, 1, 10)
+        worksheet.write(1, 1, 20)
+        worksheet.write(2, 1, 10)
+        worksheet.write(3, 1, 30)
+
+        chart.add_series(
+            {"categories": "=Sheet1!$A$1:$A$4", "values": "=Sheet1!$B$1:$B$4"}
+        )
+
+        worksheet.insert_chart("E9", chart)
+
+        workbook.close()
+
+        self.assertExcelEqual()
diff --git a/xlsxwriter/test/comparison/xlsx_files/escapes09.xlsx b/xlsxwriter/test/comparison/xlsx_files/escapes09.xlsx
diff --git a/xlsxwriter/worksheet.py b/xlsxwriter/worksheet.py
@@ -44,10 +44,6 @@
 from .exceptions import DuplicateTableName
 from .exceptions import OverlappingRange
 
-# Compile performance critical regular expressions.
-re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
-re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")
-
 re_dynamic_function = re.compile(
     r"""
     \bANCHORARRAY\(    |
@@ -6781,15 +6777,8 @@ def _write_cell(self, row, col, cell):
             else:
                 # Write an optimized in-line string.
 
-                # Escape control characters. See SharedString.pm for details.
-                string = re_control_chars_1.sub(r"_x005F\1", string)
-                string = re_control_chars_2.sub(
-                    lambda match: "_x%04X_" % ord(match.group(1)), string
-                )
-
-                # Escapes non characters in strings.
-                string = string.replace("\uFFFE", "_xFFFE_")
-                string = string.replace("\uFFFF", "_xFFFF_")
+                # Convert control character to a _xHHHH_ escape.
+                string = self._escape_control_characters(string)
 
                 # Write any rich strings without further tags.
                 if string.startswith("<r>") and string.endswith("</r>"):

diff --git a/xlsxwriter/xmlwriter.py b/xlsxwriter/xmlwriter.py
@@ -12,6 +12,11 @@
 import re
 from io import StringIO
 
+# Compile performance critical regular expressions.
+re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
+re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")
+xml_escapes = re.compile('["&<>\n]')
+
 
 class XMLwriter(object):
     """
@@ -21,7 +26,6 @@ class XMLwriter(object):
 
     def __init__(self):
         self.fh = None
-        self.escapes = re.compile('["&<>\n]')
         self.internal_fh = False
 
     def _set_filehandle(self, filehandle):
@@ -94,6 +98,8 @@ def _xml_data_element(self, tag, data, attributes=[]):
             tag += ' %s="%s"' % (key, value)
 
         data = self._escape_data(data)
+        data = self._escape_control_characters(data)
+
         self.fh.write("<%s>%s</%s>" % (tag, data, end_tag))
 
     def _xml_string_element(self, index, attributes=[]):
@@ -178,7 +184,7 @@ def _xml_rich_inline_string(self, string, attributes=[]):
     def _escape_attributes(self, attribute):
         # Escape XML characters in attributes.
         try:
-            if not self.escapes.search(attribute):
+            if not xml_escapes.search(attribute):
                 return attribute
         except TypeError:
             return attribute
@@ -197,10 +203,32 @@ def _escape_data(self, data):
         # is different from _escape_attributes() in that double quotes
         # are not escaped by Excel.
         try:
-            if not self.escapes.search(data):
+            if not xml_escapes.search(data):
                 return data
         except TypeError:
             return data
 
         data = data.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
         return data
+
+    @staticmethod
+    def _escape_control_characters(data):
+        # Excel escapes control characters with _xHHHH_ and also escapes any
+        # literal strings of that type by encoding the leading underscore.
+        # So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
+        # The following substitutions deal with those cases.
+        try:
+            # Escape the escape.
+            data = re_control_chars_1.sub(r"_x005F\1", data)
+        except TypeError:
+            return data
+
+        # Convert control character to the _xHHHH_ escape.
+        data = re_control_chars_2.sub(
+            lambda match: "_x%04X_" % ord(match.group(1)), data
+        )
+
+        # Escapes non characters in strings.
+        data = data.replace("\uFFFE", "_xFFFE_").replace("\uFFFF", "_xFFFF_")
+
+        return data