From 9150753a2041aaa0caacc8b1def7c23cbcd9745d Mon Sep 17 00:00:00 2001 From: longphan98 Date: Wed, 25 May 2022 13:38:44 +0700 Subject: [PATCH] [TIKA-1800] Decode the escape characters in front of special characters --- .../main/java/org/apache/tika/mime/MediaType.java | 15 ++++++++++++--- .../java/org/apache/tika/mime/MediaTypeTest.java | 13 +++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java index 13ad6ed9cd..170431752f 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java @@ -39,15 +39,15 @@ public final class MediaType implements Comparable, Serializable { */ private static final long serialVersionUID = -3831000556189036392L; - private static final Pattern SPECIAL = Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=]"); + private static final Pattern SPECIAL = Pattern.compile("[()<>@,;:\\\\\"/\\[\\]?=]"); private static final Pattern SPECIAL_OR_WHITESPACE = - Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]"); + Pattern.compile("[()<>@,;:\\\\\"/\\[\\]?=\\s]"); /** * See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters. */ - private static final String VALID_CHARS = "([^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]+)"; + private static final String VALID_CHARS = "([^\\c\\()<>@,;:\\\\\"/\\[\\]?=\\s]+)"; private static final Pattern TYPE_PATTERN = Pattern.compile("(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*($|;.*)"); @@ -331,6 +331,7 @@ private static Map parseParameters(String string) { /** * Fuzzy unquoting mechanism that works also with somewhat malformed * quotes. 
+ * TIKA-1800: also removes the escape characters that precede special characters, so that the value can be reused (e.g. passed on to a new MediaType as a parameter) * * @param s string to unquote * @return unquoted string @@ -342,6 +343,14 @@ private static String unquote(String s) { while (s.endsWith("\"") || s.endsWith("'")) { s = s.substring(0, s.length() - 1); } + for (int i = 0; i < s.length() - 1; i++) { + if (s.charAt(i) == '\\' && !('0' <= s.charAt(i + 1) && s.charAt(i + 1) <= '9') && + !('a' <= s.charAt(i + 1) && s.charAt(i + 1) <= 'z') && + s.charAt(i + 1) != '-' && s.charAt(i + 1) != '+' && + s.charAt(i + 1) != '.' && s.charAt(i + 1) != '_') { + s = s.substring(0, i) + s.substring(i + 1); + } + } return s; } diff --git a/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java b/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java index 64a2bebf82..47aa52abf9 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java @@ -21,13 +21,26 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import java.io.IOException; import java.util.HashMap; import java.util.Map; +import org.apache.tika.exception.TikaException; import org.junit.jupiter.api.Test; +import org.xml.sax.SAXException; public class MediaTypeTest { + // TIKA-1800 + @Test + public void testEscapedSpecialChar() { + MediaType mType = new MediaType(MediaType.APPLICATION_XML, "x-report", "#report?"); + String cType = mType.toString(); + mType = MediaType.parse(cType); + String report = mType.getParameters().get("x-report"); + assertEquals("#report?", report); + } + @Test public void testBasics() { assertEquals("application/octet-stream",