From 98303bbda342433ef3735b13aa902cef4499c5e7 Mon Sep 17 00:00:00 2001 From: Kevin Saul Date: Sun, 28 May 2023 21:36:28 +1200 Subject: [PATCH 1/4] add pedantic whitespace mode --- tinyxml2.cpp | 22 +++++-- tinyxml2.h | 5 +- xmltest.cpp | 172 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 192 insertions(+), 7 deletions(-) diff --git a/tinyxml2.cpp b/tinyxml2.cpp index 4b561b3d..901e3957 100755 --- a/tinyxml2.cpp +++ b/tinyxml2.cpp @@ -715,7 +715,7 @@ bool XMLUtil::ToUnsigned64(const char* str, uint64_t* value) { } -char* XMLDocument::Identify( char* p, XMLNode** node ) +char* XMLDocument::Identify( char* p, XMLNode** node, bool first ) { TIXMLASSERT( node ); TIXMLASSERT( p ); @@ -767,9 +767,19 @@ char* XMLDocument::Identify( char* p, XMLNode** node ) p += dtdHeaderLen; } else if ( XMLUtil::StringEqual( p, elementHeader, elementHeaderLen ) ) { - returnNode = CreateUnlinkedNode( _elementPool ); - returnNode->_parseLineNum = _parseCurLineNum; - p += elementHeaderLen; + + // Preserve whitespace pedantically before closing tag, when it's immediately after opening tag + if (WhitespaceMode() == PEDANTIC_WHITESPACE && first && p != start && *(p + elementHeaderLen) == '/') { + returnNode = CreateUnlinkedNode(_textPool); + returnNode->_parseLineNum = startLine; + p = start; // Back it up, all the text counts. 
+ _parseCurLineNum = startLine; + } + else { + returnNode = CreateUnlinkedNode(_elementPool); + returnNode->_parseLineNum = _parseCurLineNum; + p += elementHeaderLen; + } } else { returnNode = CreateUnlinkedNode( _textPool ); @@ -1070,14 +1080,16 @@ char* XMLNode::ParseDeep( char* p, StrPair* parentEndTag, int* curLineNumPtr ) if (_document->Error()) return 0; + bool first = true; while( p && *p ) { XMLNode* node = 0; - p = _document->Identify( p, &node ); + p = _document->Identify( p, &node, first ); TIXMLASSERT( p ); if ( node == 0 ) { break; } + first = false; const int initialLineNum = node->_parseLineNum; diff --git a/tinyxml2.h b/tinyxml2.h index b0c8b6c0..45d6980c 100755 --- a/tinyxml2.h +++ b/tinyxml2.h @@ -1704,7 +1704,8 @@ class TINYXML2_LIB XMLElement : public XMLNode enum Whitespace { PRESERVE_WHITESPACE, - COLLAPSE_WHITESPACE + COLLAPSE_WHITESPACE, + PEDANTIC_WHITESPACE }; @@ -1915,7 +1916,7 @@ class TINYXML2_LIB XMLDocument : public XMLNode void DeepCopy(XMLDocument* target) const; // internal - char* Identify( char* p, XMLNode** node ); + char* Identify( char* p, XMLNode** node, bool first ); // internal void MarkInUse(const XMLNode* const); diff --git a/xmltest.cpp b/xmltest.cpp index c3ce079e..ae976042 100755 --- a/xmltest.cpp +++ b/xmltest.cpp @@ -1869,6 +1869,178 @@ int main( int argc, const char ** argv ) XMLTest( "Whitespace all space", true, 0 == doc.FirstChildElement()->FirstChild() ); } + // ----------- Preserve Whitespace ------------ + { + const char* xml = "This is ' \n\n text '"; + XMLDocument doc(true, PRESERVE_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with whitespace preserved", false, doc.Error()); + XMLTest("Whitespace preserved", "This is ' \n\n text '", doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " This \nis ' text ' "; + XMLDocument doc(true, PRESERVE_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with whitespace preserved", false, doc.Error()); + XMLTest("Whitespace preserved", " This \nis ' text ' 
", doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " \n This is ' text ' \n"; + XMLDocument doc(true, PRESERVE_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with whitespace preserved", false, doc.Error()); + XMLTest("Whitespace preserved", " \n This is ' text ' \n", doc.FirstChildElement()->GetText()); + } + + // Following cases are for text that is all whitespace which are not preserved intentionally + { + const char* xml = " "; + XMLDocument doc(true, PRESERVE_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with whitespace preserved", false, doc.Error()); + XMLTest("Whitespace preserved", true, 0 == doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " "; + XMLDocument doc(true, PRESERVE_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with whitespace preserved", false, doc.Error()); + XMLTest("Whitespace preserved", true, 0 == doc.FirstChildElement()->GetText()); + } + + { + const char* xml = "\n\n"; + XMLDocument doc(true, PRESERVE_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with whitespace preserved", false, doc.Error()); + XMLTest("Whitespace preserved", true, 0 == doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " \n"; + XMLDocument doc(true, PRESERVE_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with whitespace preserved", false, doc.Error()); + XMLTest("Whitespace preserved", true, 0 == doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " \n \n "; + XMLDocument doc(true, PRESERVE_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with whitespace preserved", false, doc.Error()); + XMLTest("Whitespace preserved", true, 0 == doc.FirstChildElement()->GetText()); + } + + // ----------- Pedantic Whitespace ------------ + { + const char* xml = "This is ' \n\n text '"; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", "This is ' \n\n text '", doc.FirstChildElement()->GetText()); + } 
+ + { + const char* xml = " This \nis ' text ' "; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", " This \nis ' text ' ", doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " \n This is ' text ' \n"; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", " \n This is ' text ' \n", doc.FirstChildElement()->GetText()); + } + + // Following cases are for text that is all whitespace which is preserved with pedantic mode + { + const char* xml = " "; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", " ", doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " "; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", " ", doc.FirstChildElement()->GetText()); + } + + { + const char* xml = "\n\n\n"; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", "\n\n", doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " \n \n "; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", " \n", doc.FirstChildElement()->GetText()); + } + + { + const char* xml = " \n \n "; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", " \n \n ", doc.FirstChildElement()->GetText()); + } + + // Following cases are for checking nested elements are still parsed with pedantic whitespace + { + const char* xml = 
"\n\t This is nested text \n "; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse nested elements with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", " This is nested text ", doc.RootElement()->FirstChildElement()->GetText()); + } + + { + const char* xml = " \n"; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse nested elements with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", " ", doc.RootElement()->FirstChildElement()->GetText()); + } + + { + const char* xml = " \n "; + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.Parse(xml); + XMLTest("Parse nested elements with pedantic whitespace", false, doc.Error()); + XMLTest("Pedantic whitespace", true, 0 == doc.RootElement()->FirstChildElement()->GetText()); + } + + // Check sample xml can be parsed with pedantic mode + { + XMLDocument doc(true, PEDANTIC_WHITESPACE); + doc.LoadFile("resources/dream.xml"); + XMLTest("Load dream.xml with pedantic whitespace mode", false, doc.Error()); + + XMLTest("Dream", "xml version=\"1.0\"", + doc.FirstChild()->ToDeclaration()->Value()); + XMLTest("Dream", true, doc.FirstChild()->NextSibling()->ToUnknown() != 0); + XMLTest("Dream", "DOCTYPE PLAY SYSTEM \"play.dtd\"", + doc.FirstChild()->NextSibling()->ToUnknown()->Value()); + XMLTest("Dream", "And Robin shall restore amends.", + doc.LastChild()->LastChild()->LastChild()->LastChild()->LastChildElement()->GetText()); + } + { // An assert should not fire. const char* xml = ""; From 428cd1be420f99f8af5552bf1e75a1eb2474ae49 Mon Sep 17 00:00:00 2001 From: Lee Thomason Date: Tue, 21 Nov 2023 11:52:44 -0800 Subject: [PATCH 2/4] update readme --- readme.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/readme.md b/readme.md index 5245ac4e..5cc158c4 100644 --- a/readme.md +++ b/readme.md @@ -93,7 +93,7 @@ by the Document. When the Document is deleted, so are all the nodes it contains. 
### White Space -#### Whitespace Preservation (default) +#### Whitespace Preservation (default, PRESERVE_WHITESPACE) Microsoft has an excellent article on white space: http://msdn.microsoft.com/en-us/library/ms256097.aspx @@ -125,7 +125,7 @@ valuable. TinyXML-2 sees these as the same XML: 123 -#### Whitespace Collapse +#### Whitespace Collapse (COLLAPSE_WHITESPACE) For some applications, it is preferable to collapse whitespace. Collapsing whitespace gives you "HTML-like" behavior, which is sometimes more suitable @@ -143,7 +143,15 @@ However, you may also use COLLAPSE_WHITESPACE, which will: Note that (currently) there is a performance impact for using COLLAPSE_WHITESPACE. It essentially causes the XML to be parsed twice. -#### Error Reporting +#### Pedantic Whitespace (PEDANTIC_WHITESPACE) + +For applications that need to know about text nodes that are composed entirely of +whitespace, PEDANTIC_WHITESPACE is available. PEDANTIC_WHITESPACE maintains all the +whitespace between elements. + +PEDANTIC_WHITESPACE is a new mode and not as tested as the other whitespace modes. + +### Error Reporting TinyXML-2 reports the line number of any errors in an XML document that cannot be parsed correctly. In addition, all nodes (elements, declarations, From 941e2d9018af2d689e3c79a4e5310be617b23e19 Mon Sep 17 00:00:00 2001 From: Lee Thomason Date: Tue, 21 Nov 2023 11:58:20 -0800 Subject: [PATCH 3/4] fix the tests?
--- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9909120e..74d5e1c1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,7 +6,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ windows-2019, macos-10.15, ubuntu-20.04 ] + os: [ windows-2019, macos-latest, ubuntu-20.04 ] cmake: [ 3.15, 3.x ] include: - os: windows-2019 @@ -17,7 +17,7 @@ jobs: - os: ubuntu-20.04 tree: tree - - os: macos-10.15 + - os: macos-latest tree: find - cmake: 3.15 From 8d3cdf50dbfe388aadc0fdacfd026c2a9f22d9d4 Mon Sep 17 00:00:00 2001 From: Lee Thomason Date: Tue, 21 Nov 2023 12:02:26 -0800 Subject: [PATCH 4/4] now fix the badge --- readme.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/readme.md b/readme.md index 5cc158c4..81fbe68c 100644 --- a/readme.md +++ b/readme.md @@ -1,9 +1,7 @@ TinyXML-2 ========= -![Build](https://github.com/leethomason/tinyxml2/actions/workflows/test.yml/badge.svg) - -![TinyXML-2 Logo](http://www.grinninglizard.com/tinyxml2/TinyXML2_small.png) +[![Test](https://github.com/leethomason/tinyxml2/actions/workflows/test.yml/badge.svg)](https://github.com/leethomason/tinyxml2/actions/workflows/test.yml) TinyXML-2 is a simple, small, efficient, C++ XML parser that can be easily integrated into other programs.