Add support for 'xml' keyword in liblognorm.

rsyslog · Nov 15, 2021 · 3d0260d · 3d0260d
1 parent 1e18f60
commit 3d0260d
Show file tree

Hide file tree

Showing 9 changed files with 218 additions and 3 deletions.
diff --git a/configure.ac b/configure.ac
@@ -88,6 +88,28 @@ else
 fi
 AC_SUBST(FEATURE_REGEXP)
 
+# XML parsing (optional, off unless --enable-xml is given): requires libxml2 via pkg-config and defines FEATURE_XML
+AC_ARG_ENABLE(xml,
+        [AS_HELP_STRING([--enable-xml],[Enable XML parsing @<:@default=no@:>@])],
+        [case "${enableval}" in
+         yes) enable_xml="yes" ;;
+          no) enable_xml="no" ;;
+           *) AC_MSG_ERROR(bad value ${enableval} for --enable-xml) ;;
+         esac],
+        [enable_xml="no"]
+)
+AM_CONDITIONAL(ENABLE_XML, test x$enable_xml = xyes)
+if test "$enable_xml" = "yes"; then
+        PKG_CHECK_MODULES(LIBXML2, libxml2,,
+                [PKG_CHECK_MODULES(LIBXML2, libxml-2.0,,)]
+        )
+        AC_DEFINE(FEATURE_XML, 1, [XML parsing support enabled.])
+	FEATURE_XML=1
+else
+	FEATURE_XML=0
+fi
+AC_SUBST(FEATURE_XML)
+
 # debug mode settings
 AC_ARG_ENABLE(debug,
         [AS_HELP_STRING([--enable-debug],[Enable debug mode @<:@default=no@:>@])],
@@ -189,6 +211,7 @@ echo "*****************************************************"
 echo "liblognorm will be compiled with the following settings:"
 echo
 echo "Regex enabled:               $enable_regexp"
+echo "XML enabled:                 $enable_xml"
 echo "Advanced Statistics enabled: $enable_advstats"
 echo "Testbench enabled:           $enable_testbench"
 echo "Valgrind enabled:            $enable_valgrind"

diff --git a/src/Makefile.am b/src/Makefile.am
@@ -42,8 +42,8 @@ liblognorm_la_SOURCES += \
 	v1_ptree.c \
 	v1_samp.c
 
-liblognorm_la_CPPFLAGS = $(JSON_C_CFLAGS) $(WARN_CFLAGS) $(LIBESTR_CFLAGS) $(PCRE_CFLAGS)
-liblognorm_la_LIBADD = $(rt_libs) $(JSON_C_LIBS) $(LIBESTR_LIBS) $(PCRE_LIBS) -lestr
+liblognorm_la_CPPFLAGS = $(JSON_C_CFLAGS) $(WARN_CFLAGS) $(LIBESTR_CFLAGS) $(PCRE_CFLAGS) $(LIBXML2_CFLAGS)
+liblognorm_la_LIBADD = $(rt_libs) $(JSON_C_LIBS) $(LIBESTR_LIBS) $(PCRE_LIBS) $(LIBXML2_LIBS) -lestr
 # info on version-info:
 # http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html
 # Note: v2 now starts at version 5, as v1 previously also had 4

diff --git a/src/parser.c b/src/parser.c
@@ -47,6 +47,11 @@
 #include <errno.h>
 #endif
 
+#ifdef FEATURE_XML
+#include <libxml/xmlmemory.h>
+#include <libxml/parser.h>
+#endif
+
 
 /* how should output values be formatted? */
 enum FMT_MODE {
@@ -75,6 +80,41 @@ hParseInt(const unsigned char **buf, size_t *lenBuf)
 	return i;
 }
 
+
+#ifdef FEATURE_XML
+/* Credits to https://github.com/katie-snow/xml2json-c
+   This code is under GPL-3.0 License
+*/
+/*
+ * Convert a chain of sibling XML nodes into members of a JSON object.
+ *
+ * anode: first node of the sibling list to convert; may be NULL, in which
+ *        case nothing is added.
+ * jobj:  JSON object that receives one member per element node, keyed by
+ *        the element name.
+ *
+ * An element without child elements becomes a JSON string holding its text
+ * content; an element with child elements becomes a nested JSON object that
+ * is filled recursively. Nodes that are not elements (text, comments, ...)
+ * are skipped, and attributes are not converted.
+ */
+static inline void
+xml2jsonc_convert_elements(xmlNode *anode, json_object *jobj)
+{
+    xmlNode *cur_node = NULL;
+
+    for (cur_node = anode; cur_node; cur_node = cur_node->next)
+    {
+        const char *name;
+
+        if (cur_node->type != XML_ELEMENT_NODE)
+            continue;
+        name = (const char *)cur_node->name;
+
+        if (xmlChildElementCount(cur_node) == 0)
+        {
+            /* xmlNodeGetContent() hands back a buffer owned by the caller;
+             * the JSON string keeps its own copy, so release it right away.
+             */
+            xmlChar *content = xmlNodeGetContent(cur_node);
+            json_object *jstr = json_object_new_string(
+                content != NULL ? (const char *)content : "");
+            if (content != NULL)
+                xmlFree(content);
+            json_object_object_add(jobj, name, jstr);
+        }
+        else
+        {
+            json_object *child = json_object_new_object();
+            if (child == NULL)
+                continue;
+            /* jobj takes over the only reference to child; no extra
+             * json_object_get() is needed (it would leak the object).
+             */
+            json_object_object_add(jobj, name, child);
+            xml2jsonc_convert_elements(cur_node->children, child);
+        }
+    }
+}
+#endif /* #ifdef FEATURE_XML */
+
 /* parser _parse interface
  *
  * All parsers receive
@@ -2325,6 +2365,71 @@ PARSER_Parse(v2IPTables)
 	return r;
 }
 
+#ifdef FEATURE_XML
+/**
+ * Parse XML. This parser tries to find XML data inside a message.
+ * If it finds valid XML, it converts it to JSON and extracts it.
+ *
+ * Note: The XML parser expects a string that begins with '<' and
+ * ends with '>'. Whitespace or any other character at the
+ * beginning of the string will cause a parse failure.
+ *
+ * Note: If there is extra content after the XML content, the XML
+ * library would reject the whole string. As a workaround, only the
+ * text up to and including the last '>' in the message is handed to
+ * the XML library; whatever follows is not consumed by this parser.
+ * A message without any '>' is not treated as XML.
+ *
+ * On success r is set to 0, *parsed receives the number of consumed
+ * bytes and, if value is non-NULL, *value receives the JSON object.
+ * On any failure r keeps the value it had before (set up outside of
+ * this block by PARSER_Parse/CHKN).
+ *
+ * added 2021-02-01 by [email protected]
+ */
+PARSER_Parse(XML)
+        xmlDocPtr doc = NULL;
+        xmlNodePtr root_element = NULL;
+        json_object *json = NULL;
+        const char *const start = (const char *) npb->str + *offs;
+        const char *last_gt = NULL;
+        char *cstr = NULL;
+        size_t xml_len = 0;
+
+        /* Find the last occurrence of '>' in the string */
+        last_gt = strrchr(start, '>');
+        if(last_gt == NULL)
+                goto done; /* no '>' at all: this cannot be XML */
+
+        /* Truncate the string after the last occurrence of '>' */
+        xml_len = (size_t)(last_gt - start) + 1;
+        cstr = strndup(start, xml_len);
+        CHKN(cstr);
+
+        doc=xmlParseDoc((xmlChar*) cstr);
+        free(cstr);
+
+        /* Invalid XML string */
+        if (doc == NULL) {
+            goto done;
+        }
+
+        /* Now convert XML document into JSON document */
+        json = json_object_new_object();
+        if(json == NULL)
+                goto done;
+        root_element = xmlDocGetRootElement(doc);
+        xml2jsonc_convert_elements(root_element, json);
+
+        /* parsing OK */
+        *parsed = xml_len;
+        r = 0;
+
+        if(value == NULL) {
+                json_object_put(json);
+        } else {
+                *value = json;
+        }
+
+done:
+        if(doc != NULL)
+            xmlFreeDoc(doc);
+        /* xmlCleanupParser() is intentionally not called here: it tears
+         * down libxml2's global state and is meant to be called once at
+         * process exit, not by a library after every single parse.
+         */
+        return r;
+}
+#endif /* #ifdef FEATURE_XML */
+
+
+
 /**
  * Parse JSON. This parser tries to find JSON data inside a message.
  * If it finds valid JSON, it will extract it. Extra data after the

diff --git a/src/parser.h b/src/parser.h
@@ -78,6 +78,9 @@ PARSERDEF_NO_DATA(MAC48);
 PARSERDEF_NO_DATA(CEF);
 PARSERDEF(CheckpointLEA);
 PARSERDEF(NameValue);
+#ifdef FEATURE_XML
+PARSERDEF_NO_DATA(XML);
+#endif
 
 #undef PARSERDEF_NO_DATA
 

diff --git a/src/pdag.c b/src/pdag.c
@@ -99,7 +99,10 @@ static struct ln_parser_info parser_lookup_table[] = {
 	PARSER_ENTRY("string-to", StringTo, 32),
 	PARSER_ENTRY("char-to", CharTo, 32),
 	PARSER_ENTRY("char-sep", CharSeparated, 32),
-	PARSER_ENTRY("string", String, 32)
+	PARSER_ENTRY("string", String, 32),
+#ifdef FEATURE_XML
+        PARSER_ENTRY_NO_DATA("xml", XML, 4),
+#endif
 };
 #define NPARSERS (sizeof(parser_lookup_table)/sizeof(struct ln_parser_info))
 #define DFLT_USR_PARSER_PRIO 30000 /**< default priority if user has not specified it */

diff --git a/src/v1_parser.c b/src/v1_parser.c
@@ -63,6 +63,7 @@ hParseInt(const unsigned char **buf, size_t *lenBuf)
 	return i;
 }
 
+
 /* parsers for the primitive types
  *
  * All parsers receive

diff --git a/tests/Makefile.am b/tests/Makefile.am
@@ -158,6 +158,10 @@ REGEXP_TESTS = \
 	field_tokenized_with_regex.sh \
 	field_regex_while_regex_support_is_disabled.sh
 
+XML_TESTS = \
+	field_xml.sh \
+	field_xml_jsoncnf.sh
+
 EXTRA_DIST = exec.sh \
 	$(TESTS_SHELLSCRIPTS) \
 	$(REGEXP_TESTS) \
@@ -167,3 +171,7 @@ EXTRA_DIST = exec.sh \
 if ENABLE_REGEXP
 TESTS += $(REGEXP_TESTS)
 endif
+
+if ENABLE_XML
+TESTS += $(XML_TESTS)
+endif
diff --git a/tests/field_xml.sh b/tests/field_xml.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# added 2021-11-14 by Theo Bertin
+# This file is part of the liblognorm project, released under ASL 2.0
+. $srcdir/exec.sh
+
+test_def $0 "XML field"
+add_rule 'version=2'
+add_rule 'rule=:%field:xml%'
+
+execute '<?xml version="1.0" encoding="UTF-8"?><note>This is a simple note</note>'
+assert_output_json_eq '{ "field": { "note": "This is a simple note"} }'
+
+execute '<?xml version="1.0" encoding="UTF-8"?><note><one>first note</one><two>second note</two></note>'
+assert_output_json_eq '{ "field": { "note": { "one": "first note", "two": "second note" } } }'
+
+# execute '@cee: {"f1": "1", "f2": 2}'
+# assert_output_json_eq '{ "field": { "f1": "1", "f2": 2 } }'
+
+# execute '@cee:     {"f1": "1", "f2": 2}'
+# assert_output_json_eq '{ "field": { "f1": "1", "f2": 2 } }'
+
+#
+# Things that MUST NOT work
+#
+execute '<?xml version="1.0" encoding="UTF-8"?><note>This is a simple note</note> ' # note the trailing space
+assert_output_json_eq '{ "originalmsg": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note<\/note> ", "unparsed-data": " " }'
+
+execute '<?xml version="1.0" encoding="UTF-8"?><note>This is a simple note'
+assert_output_json_eq '{ "originalmsg": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note", "unparsed-data": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note" }'
+
+execute '<?xml version="1.0" encoding="UTF-8"?><note>This is a simple note</note2>'
+assert_output_json_eq '{ "originalmsg": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note</note2>", "unparsed-data": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note</note2>" }'
+
+
+cleanup_tmp_files
+
diff --git a/tests/field_xml_jsoncnf.sh b/tests/field_xml_jsoncnf.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# added 2021-11-14 by Theo Bertin
+# This file is part of the liblognorm project, released under ASL 2.0
+. $srcdir/exec.sh
+
+test_def $0 "XML field"
+add_rule 'version=2'
+add_rule 'rule=:%{"name":"field", "type":"xml"}%'
+
+execute '<?xml version="1.0" encoding="UTF-8"?><note>This is a simple note</note>'
+assert_output_json_eq '{ "field": { "note": "This is a simple note"} }'
+
+execute '<?xml version="1.0" encoding="UTF-8"?><note><one>first note</one><two>second note</two></note>'
+assert_output_json_eq '{ "field": { "note": { "one": "first note", "two": "second note" } } }'
+
+# execute '@cee: {"f1": "1", "f2": 2}'
+# assert_output_json_eq '{ "field": { "f1": "1", "f2": 2 } }'
+
+# execute '@cee:     {"f1": "1", "f2": 2}'
+# assert_output_json_eq '{ "field": { "f1": "1", "f2": 2 } }'
+
+#
+# Things that MUST NOT work
+#
+execute '<?xml version="1.0" encoding="UTF-8"?><note>This is a simple note</note> ' # note the trailing space
+assert_output_json_eq '{ "originalmsg": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note<\/note> ", "unparsed-data": " " }'
+
+execute '<?xml version="1.0" encoding="UTF-8"?><note>This is a simple note'
+assert_output_json_eq '{ "originalmsg": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note", "unparsed-data": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note" }'
+
+execute '<?xml version="1.0" encoding="UTF-8"?><note>This is a simple note</note2>'
+assert_output_json_eq '{ "originalmsg": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note</note2>", "unparsed-data": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><note>This is a simple note</note2>" }'
+
+
+cleanup_tmp_files
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -63,6 +63,7 @@ hParseInt(const unsigned char **buf, size_t *lenBuf) @@
     	return i;
     }
     /* parsers for the primitive types
      *
      * All parsers receive
@@ Expand Down @@