From c00088014bf83e5aec835343acf69231bcbdafcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20F=C3=A4=C3=9Fler?= Date: Fri, 14 Jan 2022 19:40:00 +0100 Subject: [PATCH] Fixing a bug when multi-byte characters were split (#75) --- .../fasterxml/aalto/out/ByteXmlWriter.java | 20 ++++--- .../fasterxml/aalto/sax/TestSaxWriter.java | 57 +++++++++++++++++++ 2 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 src/test/java/com/fasterxml/aalto/sax/TestSaxWriter.java diff --git a/src/main/java/com/fasterxml/aalto/out/ByteXmlWriter.java b/src/main/java/com/fasterxml/aalto/out/ByteXmlWriter.java index dbdf82c..f8ce09f 100644 --- a/src/main/java/com/fasterxml/aalto/out/ByteXmlWriter.java +++ b/src/main/java/com/fasterxml/aalto/out/ByteXmlWriter.java @@ -390,7 +390,7 @@ public final void writeStartTagEnd() flushBuffer(); } _outputBuffer[_outputPtr++] = BYTE_GT; - } + } @Override public void writeStartTagEmptyEnd() @@ -435,7 +435,7 @@ public final void writeEndTag(WName name) ptr += name.appendBytes(bbuf, ptr); bbuf[ptr++] = BYTE_GT; _outputPtr = ptr; - } + } /* /********************************************************************** @@ -572,6 +572,8 @@ protected final void writeAttrValue(char[] vbuf, int offset, int len) { if (_surrogate != 0) { outputSurrogates(_surrogate, vbuf[offset]); +// reset the temporary surrogate storage + _surrogate = 0; ++offset; --len; } @@ -785,7 +787,7 @@ public int writeCData(char[] cbuf, int offset, int len) writeCDataEnd(); // will check surrogates } return ix; - } + } protected int writeCDataContents(char[] cbuf, int offset, int len) throws IOException, XMLStreamException @@ -865,7 +867,7 @@ protected int writeCDataContents(char[] cbuf, int offset, int len) } } return -1; - } + } @Override public final void writeCharacters(String text) @@ -908,6 +910,8 @@ public final void writeCharacters(char[] cbuf, int offset, int len) { if (_surrogate != 0) { outputSurrogates(_surrogate, cbuf[offset]); +// reset the temporary surrogate 
storage + _surrogate = 0; ++offset; --len; } @@ -1088,7 +1092,7 @@ private final void writeSplitCharacters(char[] cbuf, int offset, int len) } _outputBuffer[_outputPtr++] = (byte)ch; } - } + } /* /********************************************************************** @@ -1439,7 +1443,7 @@ public void writeXmlDeclaration(String version, String encoding, String standalo // !!! TBI: check validity writeRaw(version, 0, version.length()); writeRaw(BYTE_APOS); - + if (encoding != null && encoding.length() > 0) { writeRaw(BYTES_XMLDECL_ENCODING); // !!! TBI: check validity @@ -1453,7 +1457,7 @@ public void writeXmlDeclaration(String version, String encoding, String standalo writeRaw(BYTE_APOS); } writeRaw(BYTE_QMARK, BYTE_GT); - } + } /* /********************************************************************** @@ -1594,7 +1598,7 @@ protected final void flushBuffer() protected final void writeAsEntity(int c) throws IOException { - // Quickie check to avoid + // Quickie check to avoid byte[] buf = _outputBuffer; int ptr = _outputPtr; diff --git a/src/test/java/com/fasterxml/aalto/sax/TestSaxWriter.java b/src/test/java/com/fasterxml/aalto/sax/TestSaxWriter.java new file mode 100644 index 0000000..8c80242 --- /dev/null +++ b/src/test/java/com/fasterxml/aalto/sax/TestSaxWriter.java @@ -0,0 +1,57 @@ +package com.fasterxml.aalto.sax; + +import com.fasterxml.aalto.out.Utf8XmlWriter; +import com.fasterxml.aalto.out.WriterConfig; + +import java.io.ByteArrayOutputStream; + +public class TestSaxWriter extends base.BaseTestCase { + + public void testSurrogateMemory1() throws Exception { + // This test aims to produce the + // javax.xml.stream.XMLStreamException: Incomplete surrogate pair in content: first char 0xd835, second 0x78 + // error message. Before fixing the respective issue, it was provoked by a multi-byte character + // where the first byte was exactly at the end of the internal reading buffer and enough further data + // to also fill the next two internal reading buffers. 
Then, the code would try to fuse the first char (the high surrogate) + // of the original surrogate pair with the first character in the third buffer because + // ByteXmlWriter#_surrogate was not set back to 0 after writing the original surrogate pair. + StringBuilder testText = new StringBuilder(); + for (int i = 0; i < 511; i++) + testText.append('x'); + testText.append("\uD835\uDFCE"); + for (int i = 0; i < 512; i++) + testText.append('x'); + + WriterConfig writerConfig = new WriterConfig(); + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + Utf8XmlWriter writer = new Utf8XmlWriter(writerConfig, byteArrayOutputStream); + writer.writeStartTagStart(writer.constructName("testelement")); + writer.writeAttribute(writer.constructName("testattr"), testText.toString()); + writer.writeStartTagEnd(); + writer.writeEndTag(writer.constructName("testelement")); + writer.close(false); + + } + + public void testSurrogateMemory2() throws Exception { + // This test aims to produce the + // java.io.IOException: Unpaired surrogate character (0xd835) + // error message. Before fixing the respective issue, it was provoked by a surrogate pair + // whose first char (the high surrogate, 0xd835) was exactly at the end of the internal reading buffer and the next + // reading buffer was large enough to hold all the remaining data. Then, because of the missing reset of + // ByteXmlWriter#_surrogate, the code kept expecting a second (low) surrogate char that never came. 
+ StringBuilder testText = new StringBuilder(); + for (int i = 0; i < 511; i++) + testText.append('x'); + testText.append("\uD835\uDFCE"); + + WriterConfig writerConfig = new WriterConfig(); + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + Utf8XmlWriter writer = new Utf8XmlWriter(writerConfig, byteArrayOutputStream); + writer.writeStartTagStart(writer.constructName("testelement")); + writer.writeAttribute(writer.constructName("testattr"), testText.toString()); + writer.writeStartTagEnd(); + writer.writeEndTag(writer.constructName("testelement")); + writer.close(false); + } +}