Skip to content

Commit

Permalink
xmlwriter: fix issue with control characters in data elements
Browse files Browse the repository at this point in the history
Issue #978
  • Loading branch information
jmcnamara committed Nov 4, 2023
1 parent 0f10fb7 commit f5142fb
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 39 deletions.
25 changes: 2 additions & 23 deletions xlsxwriter/sharedstrings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,10 @@
# Copyright 2013-2023, John McNamara, [email protected]
#

# Standard packages.
import re

# Package imports.
from . import xmlwriter
from .utility import preserve_whitespace

# Compile performance critical regular expressions.
re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")


class SharedStrings(xmlwriter.XMLwriter):
"""
Expand Down Expand Up @@ -92,22 +85,8 @@ def _write_si(self, string):
# Write the <si> element.
attributes = []

# Excel escapes control characters with _xHHHH_ and also escapes any
# literal strings of that type by encoding the leading underscore.
# So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
# The following substitutions deal with those cases.

# Escape the escape.
string = re_control_chars_1.sub(r"_x005F\1", string)

# Convert control character to the _xHHHH_ escape.
string = re_control_chars_2.sub(
lambda match: "_x%04X_" % ord(match.group(1)), string
)

# Escapes non characters in strings.
string = string.replace("\uFFFE", "_xFFFE_")
string = string.replace("\uFFFF", "_xFFFF_")
# Convert control character to a _xHHHH_ escape.
string = self._escape_control_characters(string)

# Add attribute to preserve leading or trailing whitespace.
if preserve_whitespace(string):
Expand Down
50 changes: 50 additions & 0 deletions xlsxwriter/test/comparison/test_escapes09.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
###############################################################################
#
# Tests for XlsxWriter.
#
# SPDX-License-Identifier: BSD-2-Clause
# Copyright (c), 2013-2023, John McNamara, [email protected]
#

from ..excel_comparison_test import ExcelComparisonTest
from ...workbook import Workbook


class TestCompareXLSXFiles(ExcelComparisonTest):
    """
    Compare a file generated by XlsxWriter with a reference file from Excel.
    """

    def setUp(self):
        self.set_filename("escapes09.xlsx")

    def test_create_file(self):
        """Check a chart whose category cells contain control characters."""

        workbook = Workbook(self.got_filename)

        worksheet = workbook.add_worksheet()
        chart = workbook.add_chart({"type": "line"})

        chart.axis_ids = [52721920, 53133312]

        # Category labels, each containing an ESC (\x1b) control character.
        labels = (
            "Data\x1b[32m1",
            "Data\x1b[32m2",
            "Data\x1b[32m3",
            "Data\x1b[32m4",
        )

        # Numeric values plotted against the labels above.
        values = (10, 20, 10, 30)

        # Column A holds the labels, written top to bottom.
        for row, label in enumerate(labels):
            worksheet.write(row, 0, label)

        # Column B holds the values, written top to bottom.
        for row, value in enumerate(values):
            worksheet.write(row, 1, value)

        series = {
            "categories": "=Sheet1!$A$1:$A$4",
            "values": "=Sheet1!$B$1:$B$4",
        }
        chart.add_series(series)

        worksheet.insert_chart("E9", chart)

        workbook.close()

        self.assertExcelEqual()
Binary file not shown.
15 changes: 2 additions & 13 deletions xlsxwriter/worksheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,6 @@
from .exceptions import DuplicateTableName
from .exceptions import OverlappingRange

# Compile performance critical regular expressions.
re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")

re_dynamic_function = re.compile(
r"""
\bANCHORARRAY\( |
Expand Down Expand Up @@ -6781,15 +6777,8 @@ def _write_cell(self, row, col, cell):
else:
# Write an optimized in-line string.

# Escape control characters. See SharedString.pm for details.
string = re_control_chars_1.sub(r"_x005F\1", string)
string = re_control_chars_2.sub(
lambda match: "_x%04X_" % ord(match.group(1)), string
)

# Escapes non characters in strings.
string = string.replace("\uFFFE", "_xFFFE_")
string = string.replace("\uFFFF", "_xFFFF_")
# Convert control character to a _xHHHH_ escape.
string = self._escape_control_characters(string)

# Write any rich strings without further tags.
if string.startswith("<r>") and string.endswith("</r>"):
Expand Down
34 changes: 31 additions & 3 deletions xlsxwriter/xmlwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
import re
from io import StringIO

# Compile performance critical regular expressions.
#
# re_control_chars_1: matches a literal "_xHHHH_" sequence (four hex digits)
# so that it can be protected by prefixing it with "_x005F".
re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)")
# re_control_chars_2: matches a single control character in the ranges
# 0x00-0x08 and 0x0B-0x1F, i.e. excluding tab (0x09) and newline (0x0A).
re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])")
# xml_escapes: matches a double quote, "&", "<", ">" or newline. It is used
# as a quick pre-check before the XML escaping replacements are applied.
xml_escapes = re.compile('["&<>\n]')


class XMLwriter(object):
"""
Expand All @@ -21,7 +26,6 @@ class XMLwriter(object):

def __init__(self):
self.fh = None
self.escapes = re.compile('["&<>\n]')
self.internal_fh = False

def _set_filehandle(self, filehandle):
Expand Down Expand Up @@ -94,6 +98,8 @@ def _xml_data_element(self, tag, data, attributes=[]):
tag += ' %s="%s"' % (key, value)

data = self._escape_data(data)
data = self._escape_control_characters(data)

self.fh.write("<%s>%s</%s>" % (tag, data, end_tag))

def _xml_string_element(self, index, attributes=[]):
Expand Down Expand Up @@ -178,7 +184,7 @@ def _xml_rich_inline_string(self, string, attributes=[]):
def _escape_attributes(self, attribute):
# Escape XML characters in attributes.
try:
if not self.escapes.search(attribute):
if not xml_escapes.search(attribute):
return attribute
except TypeError:
return attribute
Expand All @@ -197,10 +203,32 @@ def _escape_data(self, data):
# is different from _escape_attributes() in that double quotes
# are not escaped by Excel.
try:
if not self.escapes.search(data):
if not xml_escapes.search(data):
return data
except TypeError:
return data

data = data.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
return data

@staticmethod
def _escape_control_characters(data):
    """
    Return data with control characters encoded as _xHHHH_ escapes.

    Excel writes control characters as _xHHHH_ and protects any literal
    text of that form by encoding its leading underscore, so that
    "\\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.

    Data that the regular expression cannot process (it raises a
    TypeError) is returned unchanged.
    """

    def _as_excel_escape(found):
        # Render the matched control character as _xHHHH_.
        return "_x%04X_" % ord(found.group(1))

    try:
        # Protect existing "_xHHHH_" literals first so that they are not
        # confused with the escapes generated below.
        protected = re_control_chars_1.sub(r"_x005F\1", data)
    except TypeError:
        return data

    # Replace each control character with its _xHHHH_ escape.
    escaped = re_control_chars_2.sub(_as_excel_escape, protected)

    # Encode the U+FFFE and U+FFFF non-characters in the same way.
    escaped = escaped.replace("\uFFFE", "_xFFFE_")
    escaped = escaped.replace("\uFFFF", "_xFFFF_")

    return escaped

0 comments on commit f5142fb

Please sign in to comment.