From f5142fb92ba3c2317c697ef1d74e56cfbb2cb15f Mon Sep 17 00:00:00 2001 From: John McNamara Date: Sat, 4 Nov 2023 00:48:18 +0000 Subject: [PATCH] xmlwriter: fix issue with control character in data elements Issue #978 --- xlsxwriter/sharedstrings.py | 25 +-------- xlsxwriter/test/comparison/test_escapes09.py | 50 ++++++++++++++++++ .../test/comparison/xlsx_files/escapes09.xlsx | Bin 0 -> 8102 bytes xlsxwriter/worksheet.py | 15 +----- xlsxwriter/xmlwriter.py | 34 ++++++++++-- 5 files changed, 85 insertions(+), 39 deletions(-) create mode 100644 xlsxwriter/test/comparison/test_escapes09.py create mode 100644 xlsxwriter/test/comparison/xlsx_files/escapes09.xlsx diff --git a/xlsxwriter/sharedstrings.py b/xlsxwriter/sharedstrings.py index fc881efb2..167d1683b 100644 --- a/xlsxwriter/sharedstrings.py +++ b/xlsxwriter/sharedstrings.py @@ -6,17 +6,10 @@ # Copyright 2013-2023, John McNamara, jmcnamara@cpan.org # -# Standard packages. -import re - # Package imports. from . import xmlwriter from .utility import preserve_whitespace -# Compile performance critical regular expressions. -re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)") -re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])") - class SharedStrings(xmlwriter.XMLwriter): """ @@ -92,22 +85,8 @@ def _write_si(self, string): # Write the element. attributes = [] - # Excel escapes control characters with _xHHHH_ and also escapes any - # literal strings of that type by encoding the leading underscore. - # So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_. - # The following substitutions deal with those cases. - - # Escape the escape. - string = re_control_chars_1.sub(r"_x005F\1", string) - - # Convert control character to the _xHHHH_ escape. - string = re_control_chars_2.sub( - lambda match: "_x%04X_" % ord(match.group(1)), string - ) - - # Escapes non characters in strings. 
- string = string.replace("\uFFFE", "_xFFFE_") - string = string.replace("\uFFFF", "_xFFFF_") + # Convert control character to a _xHHHH_ escape. + string = self._escape_control_characters(string) # Add attribute to preserve leading or trailing whitespace. if preserve_whitespace(string): diff --git a/xlsxwriter/test/comparison/test_escapes09.py b/xlsxwriter/test/comparison/test_escapes09.py new file mode 100644 index 000000000..633ae3ee3 --- /dev/null +++ b/xlsxwriter/test/comparison/test_escapes09.py @@ -0,0 +1,50 @@ +############################################################################### +# +# Tests for XlsxWriter. +# +# SPDX-License-Identifier: BSD-2-Clause +# Copyright (c), 2013-2023, John McNamara, jmcnamara@cpan.org +# + +from ..excel_comparison_test import ExcelComparisonTest +from ...workbook import Workbook + + +class TestCompareXLSXFiles(ExcelComparisonTest): + """ + Test file created by XlsxWriter against a file created by Excel. + + """ + + def setUp(self): + self.set_filename("escapes09.xlsx") + + def test_create_file(self): + """Test the creation of a simple XlsxWriter file.""" + + workbook = Workbook(self.got_filename) + + worksheet = workbook.add_worksheet() + chart = workbook.add_chart({"type": "line"}) + + chart.axis_ids = [52721920, 53133312] + + worksheet.write(0, 0, "Data\x1b[32m1") + worksheet.write(1, 0, "Data\x1b[32m2") + worksheet.write(2, 0, "Data\x1b[32m3") + worksheet.write(3, 0, "Data\x1b[32m4") + + worksheet.write(0, 1, 10) + worksheet.write(1, 1, 20) + worksheet.write(2, 1, 10) + worksheet.write(3, 1, 30) + + chart.add_series( + {"categories": "=Sheet1!$A$1:$A$4", "values": "=Sheet1!$B$1:$B$4"} + ) + + worksheet.insert_chart("E9", chart) + + workbook.close() + + self.assertExcelEqual() diff --git a/xlsxwriter/test/comparison/xlsx_files/escapes09.xlsx b/xlsxwriter/test/comparison/xlsx_files/escapes09.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..c8895578010de2e76a906b4d1d4f04629b651ca3 GIT binary 
patch literal 8102 zcma)BWmuG3+Z`o`WJZ=@bQ}k(81y=}zg9kQhMa z8;8vSX}`C0x$po5&#uI zPtw8O)!g3IP~FSX+{J*y)6P~y9SA@JX-S&{^7}z^HB7CWw25EL}G{6Hz=0O?L0Y2mNH|mMwfv@Y_f-U6J#}s zl!a2ujqe%(wKw9czPy0;&3$~3Xhf$(#c<0Ub$m$J9c-O&_( zy@4oF7$b9k>+yJIUO#faS-Kl@lQ)QBE;Ur-nX>j(DZ%4d53Q&gKF32qq&1en$7(!8 z<@q3GD#5~3V0*Po_$#H<$C^Mp&V_p>D-Sn(HW=I(dazpD!MK)t$X127ZpK`%aJon|y*)v3pJ0epJ*y7)fDWeOS$7Sz zi`v82Vw_Dj3|&{t&q~{TL17wo-D%j6;p!K z0MvjI-{#G{u4CH`ukYcle(yDu`&!646+?TXzTd0ZuU%^`nB~qDX114n3t^S>W!{O5 zqJ}=4+<{@TB3+qO2d^LKfnDR@EexyLR+hb$YcA8gF1|YuRD(9kX_uoQ#Vn}np4rub z*XOM}?QhPBlhwhjQ;@9^HPPkHf@z1Q8b2V?Xo4ojgq)yS84(&?Envfx1SLU4{OO^g zHN0k5Xf`9m4hWY_@m@sZ=BvB$=rR4vHQ2`ZNuzx zZmVx#esGJ%cS@?sMtpk(d3Rska^P6p_e81S21j_iu+qHa+FJTThJ%g|;8zjXho0l` zqD0iViW2g&h^7wC=6_N#5^gBp%0=2fkI^LA<(*n_EiDTnW=^;y<|w(xPVb@j_Cykd zI70N*7Ee18ip;%m@jafAFWS?s>>}jnj4V+}Cysx%3Xy4eJ@oXh>;tAfKkH`QoGb|C zuGNgbp~j4=JypG+X)^{;B$OV(T+OE&XvF*>{f!uRhIDY$QVV~(1lue~??clB$WgqL zU6bg!k>4UaS0x%TBG#W!<_s}KeTzTnI_dg?Q~D?5I{rd@)5IX_9Yu!L4=vJL;#4y> zoLHvgZaP{=W6wo9Xoi!jeMZ$c5ykwO`JflhSZrkD>d13woq z$Q|!WLyK|T+o81U9Q)#4oHQ90BqOA`OvA!TvI4BweLKwW&Ss+SZ;4(V_33O1ZO3Pa z+-XD0+;GY84OqSLll=?dQ+}%0yoqu_1r+SSm2IJc4#EGXh{B+JID0N59VjC z0n0X;#Lwd0t@qWQ@$2?)UTpKjqhe~!f7hrhIuxK9<}BLw8i&CQtwE-KTyQ#Fcr}bJ zaDUi&pXpPw;LV|^iP%S@hsBDbX0e~S~iR=nX5(kP`Q8y54S>bR-)@KL z&zf}1_8t^&rYv>}ETWP_u^3zt<}rHzfxbBt3UV@Rt_)8HvXDK?hR zU5z&8JDIw7RTq!oA$yI!pKhLrFdKylozv$&P?3O);2{f)qrbJP6p8Kg2N%``JuO!- zYK`u8=$y(`JeDg5v$vQ%pOT}R8BopKu87lpo)rMj8BV{Gxbi-Tcqez4oiEJO2hor@ zQ{5QgrP3xYe3~aR=TM{&y*P)axSk_n5a-+CP79o%F)`@q%IGL0g zK+%4cAtUar#s?(>E6VuEe#+otWp3{3!ui)5_aF6uKU7*4!wcGxIirEN%qzkZF{EXQ zuxa^U2P8(Hft}#QUHtoa_-8v#`4yB2{zn!~i8Q_zHyogM?uzwi_LQf5*yRaT#Pl>RtVr)I(^JMi(#KuEAbH(mzV$7JAreBfH}W;EQC{SH!9CcP9wwFK|<}Ao~7{6X6<~z z6F1-bzNF&uXi4R`G=v`q$MN2}wU$$gW5y@?M`hgtYV)bLyGMe;6KG0b@fYri>To|E zkdNAjlXHqJFo-{FTwsshPvm{%{9eyzdVZ71fp3|e(DPOClTNjA1AxDw?;tSC+Ce$(@ZC;e3QS9c;ob!30*#v%uLa9X&CFa{eIL<1&MD>$Tb zt?%7-Wj#a-GCO^L7byGmV5bjKy3q2e*Sy~`d{ZEDTTujWOs3NdGa{nw30GQxyvx+K 
zUtkDi*$Q9wIkT8XiB@|uBw~Z<0EWxP%qX*N>WW3{ziHj_zL+i!K6VN;rGVa{Dq?k~ zuhOX^`9A&3*|JWXxraqBHoD9tua+wmwp zs{!y~x7GSY&sZ=5t?fA^grcNM_S`3*+fI;&3>xb_FzQm@vGsVi8|S6~S(b)>tIjiG zErm4+1j*NLl~7$1Gd2x@;)*4ph!bplW4X1ckJerRvy+&Kqz>GiSLvn@V$6^<((Brn z@aj4ReI-ZW{Vu<|QM#J+IQ|&ie2yupaF^7>YuhL5H*MiKfx}<=Y;*3_7d^Vh%6FiA zMfZq!SVPb-mX_-6K(-7FcgB1TXPwdD$ngfjgjMyx=?aVKp#}$D!C^Y=ga;`J4$Idt zyspBPC%lgE8_P4&%Y@U(lCPtC>y7N$sn2*>PINe7$*cgQxKvAG79PqrXA{NsL`}om zY!peVF!%|rUWZ?;1iJIOZEAiruCEL^yhW+ZBeL>F6o>aQv*5EDQ+n<3V4L*zwNAnt zb*Wt>Uv0(=j{Wx(%3&+rM{Ms|AJ1y><`v>Yh&)PKy?xwMW?Y5)8>E+$4hhwO9kX^c zwqN;9%#Z~egg-^SD02d-gZXk)nKS$>^Z$zaqsT8y`#+R<=|!1iE*R|IY(X#>M0tpwd+3h}uZGStw+-n4891NCYA8mJ$<(+O^igXo zSn(Up)V|&hs?pm8Q9f zd@?G|Y0Et^*03Uqv!&Uw!N9hP7V0Xj6~?5I)>67@<_%v_A@`5*`|{KBrt{F_lh0q$ zyq%}w*|h!C)pxnW2qZ`WO41N7Ziq+qjWa{#OzNezvI(RxW{}Rb^4Wcw*ZJdChGLTNI*gIz)%2YtF}}B$fS(NN zpDNXNx}>zcli32*xYL8kh9Flea}y*;)Trp{u{8rCiFd!%3ADtOnA8Dv!*uRZ1)VIseQdyi?xJX4(sH^-6azof z&ih2ziD1d-O#%p>R@|Enhpes*Vy;lTLL#K>*iRKk^o3R>J#6#P;!^usK5NFgWOqSH zf`+!gdw`N+8FVYxw4Q`Sr+0L4E-tU_b;%=FyuL2pr6$B}qm*^zgufj1@ih2s#aFJ| zhp9`h{3c;5hc~u)1;j7YqC4^7RPlb!E<48Z4dT62sx&#-nw*||vU+A3Z*j-@=OwT7 zfw8Q9;CUWP{;=rfbsMVB6LCzkaHgZQcYJt|W9u&zk#eCxrWsOY-scEmK{MGCK5JnF zKqh5JLZGvy)G9?ppIzHwb#Ps8F<1*HFTc;eWDY|lXhiqx)s38OdwHxXBQ;HAuPVFu z*28qx-Er!`ETwABnF$Zl3ArWFYeuuMy$7B0vb?NKEscJhayM;jT_c`;NhvTG;R$5z zrn$8x|FkJgPYW3qR&0ZLO8j1{!jaMtegjL?OD7AEZl?*ANL!!n2@lrp*^Mir>>q=L zx3-S!sg|r|R;sAzw1PssJUIgfZ_U=jua<l?v`9h=VtnpLs6gfN(d5Kcq$BWNj{cf$E?!g86Cp@3qX5ZSh|QsOz>2hLvi zw4R@}pD24zy9@CRg7f}^4a|VS_vTSXlN_;ZmgX3px^n;%%~qX~dG1ilH1HWZ9hc_5 z)QB%<*u;zXFFkj>#f~aO_@(Rd4T#SGavkw2wx!dy)Hnrr-L3 zL(!UH{zjMV%1z3t!v35x;7KeuWrW@PscLZ6JOcmhBM#Nf+)!Kz{eWQF_tupl3trQG z#=SSz$=~L+ova;BaG$WaN*b!it{QY@Y{*Vtqm6=6OW2|J$j(obyV%J$@)>H%P^pp{<&R_u zH8QwbncJEFc>U>d|B_Q6bRI@eOOZ9w?a@FQtD{JS`btuY?@s0hx=)YV>y9k9B~J`T z+#l;TtA3Yt6cP}?6oL!7M}S>8BB$T|Mu*A&_95r)DP5N*@0`W7G-;&(efr)PA7A9n zwf%%MV##1BneOGPPx`NQLfS-VhWM@YsbHMhQ 
z(aUmww=?zv-nEy0`#l29*rLTvX41<89Sk7Z-8^_CI}TG;xW)%~6RZNP=@Hm+GzPCdVUfRQNr7dt zXi~I%#4Q5u`kan!=a3@Cs7BIbFC+K(Gk3^c0vQETxt%>AEoNC=eB#$yd1x~O4XbNX zHbFil32$9FIkaR5dx?ja89R&P#NwD>)6v4DNs^#Ph6%e@fepy|ZjU9Mg^HZ-1MzU| zy|*|I<*0iD44rG#i8h{bMLnUrofo>A_&i~xAu5Z6ytT2%LWP`}H0^oJJ+3uD=U}B; z+9*&?KbXL6`+@US=CC1ZEeM0>JBg2cZC5%H78@l)1@(01^Tn+cV-EV&Ne${oeCa4M z#v;fA?Y}mX4=hgaTJa>bbzRpFJfpcgcT+g>7<=XPtnsXIr!gE_U6eSEsOrJA*>Z9E zz*$5E?}#r&<1<|e>eVer&wBu!=)F!pWWuNtv)Qo3>c1U8h*W&O>4?Umou^>0Q1uBzc0npMyJ}=$xX2Y9&^>(8htdJ zYX_cFm=YBK*#7R?QoT!vsvU#e(PM$KntF)`nF(6QY%9yTc9F(#{~o3*K0J0}&oP{) z_ce^WS;J3qn^!AbA17cYNzqFb-DGQh!BWTP>jvUHeB*7u8T^I&{b1zSaQ2Y%cS-i` zxGaWc?mdn6erKJTWi{zGaehGw$-CFi$PV1UY*VNJ2O>_N(q>A`-rnm@FjQ~u(gI7n z86^hPtjn^q9|;K8Jz9?`1|M$Y%o+~Ie-KDnYN*T4O~y5~F^#`jXoN>ZmM1V(RijKB z%n=_jvV%)T#iNsX!@o>MLxK&u_+nFzAPLI47JKjAE%h_7n7>4UTGlNyrEBhu{>f_N z8E;%5(iu91tZ`pGWC(zixueqxh-zj29mYU9o|&%9$^1dloD&_Ehxl>FFPj`HV0+BE>dtfbChHR=se*dPD)8Nk?sC-9)QRZ+;!_xVsz2-2QLI#Uj%Hms6LnC$F}~X)xVq@{jt?A zhBrS(g-235f!|>h-t0o`dGVd^0whZ8POnJl>M>T)S4`)AKMM=UCkPmXDVadFdttMm zHoe!|sY%?LSqsBJ=ub#&*Yq$lrmKT*5VcL|&03gl!m%wi&KaT8i`WgXXO&)3$T7;QS4| z64(jF;Fkm(FKp`v1hFoKitJ_n<;pw-0;w2R(?Y$KeQNxfoIzxs1$YL3Cq7Q$IInx3#54Xj+ylv**_@ z%akJC`X37$-#YfLWR~DD<#HUy+7oa%yonyj)V_LX+#5j97`wPmEbsVmeWRb7vLZnN&&R$ovxW#hR^gUA8ufaA(lYP5elmm)(^0RW)50Oa~zeEP>z zDvTeeRDYY5E(2dM`|H`sW#Gz-CiJ(H6cp+|oTB_4_17@}GU^jlTl-(spV9uGVSi7l zFT*0y0l#L}|J&XFn&Vyuyn_k&J=Oj5zr3LI*HrQ{XbEb-{ZCZ<4;kg3(SOg#E~EPZ z|Bn7IiP`_g&9BkZMuZhT?Z{zoH;qryt z@c+8dKST{EI4=#SU>wo*_7^tNO01%)ar>Iq(NOAG){{T<{E0X{K literal 0 HcmV?d00001 diff --git a/xlsxwriter/worksheet.py b/xlsxwriter/worksheet.py index e5a17022b..31453c77b 100644 --- a/xlsxwriter/worksheet.py +++ b/xlsxwriter/worksheet.py @@ -44,10 +44,6 @@ from .exceptions import DuplicateTableName from .exceptions import OverlappingRange -# Compile performance critical regular expressions. 
-re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)") -re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])") - re_dynamic_function = re.compile( r""" \bANCHORARRAY\( | @@ -6781,15 +6777,8 @@ def _write_cell(self, row, col, cell): else: # Write an optimized in-line string. - # Escape control characters. See SharedString.pm for details. - string = re_control_chars_1.sub(r"_x005F\1", string) - string = re_control_chars_2.sub( - lambda match: "_x%04X_" % ord(match.group(1)), string - ) - - # Escapes non characters in strings. - string = string.replace("\uFFFE", "_xFFFE_") - string = string.replace("\uFFFF", "_xFFFF_") + # Convert control character to a _xHHHH_ escape. + string = self._escape_control_characters(string) # Write any rich strings without further tags. if string.startswith("") and string.endswith(""): diff --git a/xlsxwriter/xmlwriter.py b/xlsxwriter/xmlwriter.py index 28b8a0f9c..d0c480428 100644 --- a/xlsxwriter/xmlwriter.py +++ b/xlsxwriter/xmlwriter.py @@ -12,6 +12,11 @@ import re from io import StringIO +# Compile performance critical regular expressions. +re_control_chars_1 = re.compile("(_x[0-9a-fA-F]{4}_)") +re_control_chars_2 = re.compile(r"([\x00-\x08\x0b-\x1f])") +xml_escapes = re.compile('["&<>\n]') + class XMLwriter(object): """ @@ -21,7 +26,6 @@ class XMLwriter(object): def __init__(self): self.fh = None - self.escapes = re.compile('["&<>\n]') self.internal_fh = False def _set_filehandle(self, filehandle): @@ -94,6 +98,8 @@ def _xml_data_element(self, tag, data, attributes=[]): tag += ' %s="%s"' % (key, value) data = self._escape_data(data) + data = self._escape_control_characters(data) + self.fh.write("<%s>%s" % (tag, data, end_tag)) def _xml_string_element(self, index, attributes=[]): @@ -178,7 +184,7 @@ def _xml_rich_inline_string(self, string, attributes=[]): def _escape_attributes(self, attribute): # Escape XML characters in attributes. 
try: - if not self.escapes.search(attribute): + if not xml_escapes.search(attribute): return attribute except TypeError: return attribute @@ -197,10 +203,32 @@ def _escape_data(self, data): # is different from _escape_attributes() in that double quotes # are not escaped by Excel. try: - if not self.escapes.search(data): + if not xml_escapes.search(data): return data except TypeError: return data data = data.replace("&", "&").replace("<", "<").replace(">", ">") return data + + @staticmethod + def _escape_control_characters(data): + # Excel escapes control characters with _xHHHH_ and also escapes any + # literal strings of that type by encoding the leading underscore. + # So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_. + # The following substitutions deal with those cases. + try: + # Escape the escape. + data = re_control_chars_1.sub(r"_x005F\1", data) + except TypeError: + return data + + # Convert control character to the _xHHHH_ escape. + data = re_control_chars_2.sub( + lambda match: "_x%04X_" % ord(match.group(1)), data + ) + + # Escapes non characters in strings. + data = data.replace("\uFFFE", "_xFFFE_").replace("\uFFFF", "_xFFFF_") + + return data