Skip to content

Commit

Permalink
TIKA-1599 (#1356)
Browse files Browse the repository at this point in the history
* TIKA-1599 -- migrate to jsoup parser
  • Loading branch information
tballison authored Sep 26, 2023
1 parent caf3125 commit 5361b6d
Show file tree
Hide file tree
Showing 74 changed files with 7,727 additions and 126 deletions.
9 changes: 8 additions & 1 deletion CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,18 @@ Release 3.0.0-BETA - ??

* Require Java 11 (TIKA-4128).

* The boilerpipe handler has been moved to tika-handler-boiler-pipe
* The boilerpipe handler has been moved to the tika-handler-boiler-pipe
package (TIKA-4138).

* We've migrated HTML parsing from TagSoup to the JSoup parser. If
you have a custom configuration that references the HtmlParser, you'll need to
change that to o.a.t.p.html.JSoupParser (TIKA-1599). The TagSoup parser is still
available in the tika-parser-tagsoup-module if you prefer the legacy parser.

* Removed xerces2 as a dependency (TIKA-4135).

Other Changes/Updates

* Fix bug in DateUtils that stripped timezone information from
incoming Calendar objects (TIKA-4126).

Expand Down
2 changes: 2 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@
<module>tika-java7</module>
<module>tika-detectors</module>
<module>tika-handlers</module>
<module>tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module</module>
<module>tika-parsers/tika-parsers-extended/tika-parser-tagsoup-package</module>
</modules>

<profiles>
Expand Down
4 changes: 2 additions & 2 deletions tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ public void testJsonMetadataPrettyPrintOutput() throws Exception {

assertTrue(json.contains(
"\"X-TIKA:Parsed-By\" : [ \"org.apache.tika.parser.DefaultParser\", " +
"\"org.apache.tika.parser.html.HtmlParser\" ],"));
"\"org.apache.tika.parser.html.JSoupParser\" ],"));
//test legacy alphabetic sort of keys
int enc = json.indexOf("\"Content-Encoding\"");
int fb = json.indexOf("fb:admins");
Expand Down Expand Up @@ -467,7 +467,7 @@ public void testConfig() throws Exception {
getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/tika-config1.xml",
resourcePrefix + "bad_xml.xml");
assertTrue(content.contains("apple"));
assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
assertTrue(content.contains("org.apache.tika.parser.html.JSoupParser"));
}

@Test
Expand Down
2 changes: 1 addition & 1 deletion tika-app/src/test/resources/test-data/tika-config1.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<properties>
<parsers>
<parser class="org.apache.tika.parser.html.HtmlParser">
<parser class="org.apache.tika.parser.html.JSoupParser">
<mime>application/vnd.wap.xhtml+xml</mime>
<mime>application/x-asp</mime>
<mime>application/xhtml+xml</mime>
Expand Down
11 changes: 10 additions & 1 deletion tika-bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,11 @@
<artifactId>tika-parser-sqlite3-package</artifactId>
<version>3.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parser-tagsoup-package</artifactId>
<version>3.0.0-SNAPSHOT</version>
</dependency>

<!-- Tika parsers modules (extended package) -->
<dependency>
Expand All @@ -269,7 +274,11 @@
<artifactId>tika-parser-sqlite3-module</artifactId>
<version>3.0.0-SNAPSHOT</version>
</dependency>

<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parser-tagsoup-module</artifactId>
<version>3.0.0-SNAPSHOT</version>
</dependency>
<!-- Tika parsers modules (ML package) -->
<dependency>
<groupId>org.apache.tika</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.html.IdentityHtmlMapper;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.BodyContentHandler;
Expand Down Expand Up @@ -117,7 +117,7 @@ public static void useHtmlParser() throws Exception {
ContentHandler handler = new DefaultHandler();
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
Parser parser = new HtmlParser();
Parser parser = new JSoupParser();
parser.parse(stream, handler, metadata, context);
}

Expand All @@ -126,7 +126,7 @@ public static void useCompositeParser() throws Exception {
ContentHandler handler = new DefaultHandler();
ParseContext context = new ParseContext();
Map<MediaType, Parser> parsersByType = new HashMap<>();
parsersByType.put(MediaType.parse("text/html"), new HtmlParser());
parsersByType.put(MediaType.parse("text/html"), new JSoupParser());
parsersByType.put(MediaType.parse("application/xml"), new XMLParser());

CompositeParser parser = new CompositeParser();
Expand Down
5 changes: 5 additions & 0 deletions tika-parent/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,11 @@
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.16.1</version>
</dependency>
<dependency>
<groupId>org.junit</groupId>
<artifactId>junit-bom</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document class or
application/vnd.openxmlformats-officedocument.wordprocessingml.template class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
application/vnd.sun.xml.writer class org.apache.tika.parser.odf.OpenDocumentParser
application/vnd.visio class org.apache.tika.parser.microsoft.OfficeParser
application/vnd.wap.xhtml+xml class org.apache.tika.parser.html.HtmlParser
application/vnd.wap.xhtml+xml class org.apache.tika.parser.html.JSoupParser
application/vnd.wordperfect; version=5.0 class org.apache.tika.parser.wordperfect.WordPerfectParser
application/vnd.wordperfect; version=5.1 class org.apache.tika.parser.wordperfect.WordPerfectParser
application/vnd.wordperfect; version=6.x class org.apache.tika.parser.wordperfect.WordPerfectParser
Expand All @@ -121,7 +121,7 @@ application/x-7z-compressed class org.apache.tika.parser.pkg.PackageParser
application/x-ace2 class org.apache.tika.parser.gdal.GDALParser
application/x-archive class org.apache.tika.parser.pkg.PackageParser
application/x-arj class org.apache.tika.parser.pkg.PackageParser
application/x-asp class org.apache.tika.parser.html.HtmlParser
application/x-asp class org.apache.tika.parser.html.JSoupParser
application/x-bag class org.apache.tika.parser.gdal.GDALParser
application/x-blx class org.apache.tika.parser.gdal.GDALParser
application/x-bplist class org.apache.tika.parser.apple.PListParser
Expand Down Expand Up @@ -248,7 +248,7 @@ application/x-xliff+zip class org.apache.tika.parser.xliff.XLZParser
application/x-xyz class org.apache.tika.parser.gdal.GDALParser
application/x-xz class org.apache.tika.parser.pkg.CompressorParser
application/x-zmap class org.apache.tika.parser.gdal.GDALParser
application/xhtml+xml class org.apache.tika.parser.html.HtmlParser
application/xhtml+xml class org.apache.tika.parser.html.JSoupParser
application/xml class org.apache.tika.parser.xml.DcXMLParser
application/xpm class org.apache.tika.parser.gdal.GDALParser
application/zip class org.apache.tika.parser.pkg.PackageParser
Expand Down Expand Up @@ -322,7 +322,7 @@ image/x-xcf class org.apache.tika.parser.image.ImageParser
message/rfc822 class org.apache.tika.parser.mail.RFC822Parser
model/vnd.dwfx+xps class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
text/csv class org.apache.tika.parser.csv.TextAndCSVParser
text/html class org.apache.tika.parser.html.HtmlParser
text/html class org.apache.tika.parser.html.JSoupParser
text/iso19139+xml class org.apache.tika.parser.geoinfo.GeographicInformationParser
text/plain class org.apache.tika.parser.csv.TextAndCSVParser
text/tsv class org.apache.tika.parser.csv.TextAndCSVParser
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document class or
application/vnd.openxmlformats-officedocument.wordprocessingml.template class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
application/vnd.sun.xml.writer class org.apache.tika.parser.odf.OpenDocumentParser
application/vnd.visio class org.apache.tika.parser.microsoft.OfficeParser
application/vnd.wap.xhtml+xml class org.apache.tika.parser.html.HtmlParser
application/vnd.wap.xhtml+xml class org.apache.tika.parser.html.JSoupParser
application/vnd.wordperfect; version=5.0 class org.apache.tika.parser.wordperfect.WordPerfectParser
application/vnd.wordperfect; version=5.1 class org.apache.tika.parser.wordperfect.WordPerfectParser
application/vnd.wordperfect; version=6.x class org.apache.tika.parser.wordperfect.WordPerfectParser
Expand All @@ -121,7 +121,7 @@ application/x-7z-compressed class org.apache.tika.parser.pkg.PackageParser
application/x-ace2 class org.apache.tika.parser.gdal.GDALParser
application/x-archive class org.apache.tika.parser.pkg.PackageParser
application/x-arj class org.apache.tika.parser.pkg.PackageParser
application/x-asp class org.apache.tika.parser.html.HtmlParser
application/x-asp class org.apache.tika.parser.html.JSoupParser
application/x-bag class org.apache.tika.parser.gdal.GDALParser
application/x-blx class org.apache.tika.parser.gdal.GDALParser
application/x-bplist class org.apache.tika.parser.apple.PListParser
Expand Down Expand Up @@ -248,7 +248,7 @@ application/x-xliff+zip class org.apache.tika.parser.xliff.XLZParser
application/x-xyz class org.apache.tika.parser.gdal.GDALParser
application/x-xz class org.apache.tika.parser.pkg.CompressorParser
application/x-zmap class org.apache.tika.parser.gdal.GDALParser
application/xhtml+xml class org.apache.tika.parser.html.HtmlParser
application/xhtml+xml class org.apache.tika.parser.html.JSoupParser
application/xml class org.apache.tika.parser.xml.DcXMLParser
application/xpm class org.apache.tika.parser.gdal.GDALParser
application/zip class org.apache.tika.parser.pkg.PackageParser
Expand Down Expand Up @@ -332,7 +332,7 @@ image/x-xcf class org.apache.tika.parser.image.ImageParser
message/rfc822 class org.apache.tika.parser.mail.RFC822Parser
model/vnd.dwfx+xps class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
text/csv class org.apache.tika.parser.csv.TextAndCSVParser
text/html class org.apache.tika.parser.html.HtmlParser
text/html class org.apache.tika.parser.html.JSoupParser
text/iso19139+xml class org.apache.tika.parser.geoinfo.GeographicInformationParser
text/plain class org.apache.tika.parser.csv.TextAndCSVParser
text/tsv class org.apache.tika.parser.csv.TextAndCSVParser
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document class or
application/vnd.openxmlformats-officedocument.wordprocessingml.template class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
application/vnd.sun.xml.writer class org.apache.tika.parser.odf.OpenDocumentParser
application/vnd.visio class org.apache.tika.parser.microsoft.OfficeParser
application/vnd.wap.xhtml+xml class org.apache.tika.parser.html.HtmlParser
application/vnd.wap.xhtml+xml class org.apache.tika.parser.html.JSoupParser
application/vnd.wordperfect; version=5.0 class org.apache.tika.parser.wordperfect.WordPerfectParser
application/vnd.wordperfect; version=5.1 class org.apache.tika.parser.wordperfect.WordPerfectParser
application/vnd.wordperfect; version=6.x class org.apache.tika.parser.wordperfect.WordPerfectParser
Expand All @@ -121,7 +121,7 @@ application/x-7z-compressed class org.apache.tika.parser.pkg.PackageParser
application/x-ace2 class org.apache.tika.parser.gdal.GDALParser
application/x-archive class org.apache.tika.parser.pkg.PackageParser
application/x-arj class org.apache.tika.parser.pkg.PackageParser
application/x-asp class org.apache.tika.parser.html.HtmlParser
application/x-asp class org.apache.tika.parser.html.JSoupParser
application/x-bag class org.apache.tika.parser.gdal.GDALParser
application/x-blx class org.apache.tika.parser.gdal.GDALParser
application/x-bplist class org.apache.tika.parser.apple.PListParser
Expand Down Expand Up @@ -248,7 +248,7 @@ application/x-xliff+zip class org.apache.tika.parser.xliff.XLZParser
application/x-xyz class org.apache.tika.parser.gdal.GDALParser
application/x-xz class org.apache.tika.parser.pkg.CompressorParser
application/x-zmap class org.apache.tika.parser.gdal.GDALParser
application/xhtml+xml class org.apache.tika.parser.html.HtmlParser
application/xhtml+xml class org.apache.tika.parser.html.JSoupParser
application/xml class org.apache.tika.parser.xml.DcXMLParser
application/xpm class org.apache.tika.parser.gdal.GDALParser
application/zip class org.apache.tika.parser.pkg.PackageParser
Expand Down Expand Up @@ -322,7 +322,7 @@ image/x-xcf class org.apache.tika.parser.image.ImageParser
message/rfc822 class org.apache.tika.parser.mail.RFC822Parser
model/vnd.dwfx+xps class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
text/csv class org.apache.tika.parser.csv.TextAndCSVParser
text/html class org.apache.tika.parser.html.HtmlParser
text/html class org.apache.tika.parser.html.JSoupParser
text/iso19139+xml class org.apache.tika.parser.geoinfo.GeographicInformationParser
text/plain class org.apache.tika.parser.csv.TextAndCSVParser
text/tsv class org.apache.tika.parser.csv.TextAndCSVParser
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document class or
application/vnd.openxmlformats-officedocument.wordprocessingml.template class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
application/vnd.sun.xml.writer class org.apache.tika.parser.odf.OpenDocumentParser
application/vnd.visio class org.apache.tika.parser.microsoft.OfficeParser
application/vnd.wap.xhtml+xml class org.apache.tika.parser.html.HtmlParser
application/vnd.wap.xhtml+xml class org.apache.tika.parser.html.JSoupParser
application/vnd.wordperfect; version=5.0 class org.apache.tika.parser.wordperfect.WordPerfectParser
application/vnd.wordperfect; version=5.1 class org.apache.tika.parser.wordperfect.WordPerfectParser
application/vnd.wordperfect; version=6.x class org.apache.tika.parser.wordperfect.WordPerfectParser
Expand All @@ -121,7 +121,7 @@ application/x-7z-compressed class org.apache.tika.parser.pkg.PackageParser
application/x-ace2 class org.apache.tika.parser.gdal.GDALParser
application/x-archive class org.apache.tika.parser.pkg.PackageParser
application/x-arj class org.apache.tika.parser.pkg.PackageParser
application/x-asp class org.apache.tika.parser.html.HtmlParser
application/x-asp class org.apache.tika.parser.html.JSoupParser
application/x-bag class org.apache.tika.parser.gdal.GDALParser
application/x-blx class org.apache.tika.parser.gdal.GDALParser
application/x-bplist class org.apache.tika.parser.apple.PListParser
Expand Down Expand Up @@ -248,7 +248,7 @@ application/x-xliff+zip class org.apache.tika.parser.xliff.XLZParser
application/x-xyz class org.apache.tika.parser.gdal.GDALParser
application/x-xz class org.apache.tika.parser.pkg.CompressorParser
application/x-zmap class org.apache.tika.parser.gdal.GDALParser
application/xhtml+xml class org.apache.tika.parser.html.HtmlParser
application/xhtml+xml class org.apache.tika.parser.html.JSoupParser
application/xml class org.apache.tika.parser.xml.DcXMLParser
application/xpm class org.apache.tika.parser.gdal.GDALParser
application/zip class org.apache.tika.parser.pkg.PackageParser
Expand Down Expand Up @@ -332,7 +332,7 @@ image/x-xcf class org.apache.tika.parser.image.ImageParser
message/rfc822 class org.apache.tika.parser.mail.RFC822Parser
model/vnd.dwfx+xps class org.apache.tika.parser.microsoft.ooxml.OOXMLParser
text/csv class org.apache.tika.parser.csv.TextAndCSVParser
text/html class org.apache.tika.parser.html.HtmlParser
text/html class org.apache.tika.parser.html.JSoupParser
text/iso19139+xml class org.apache.tika.parser.geoinfo.GeographicInformationParser
text/plain class org.apache.tika.parser.csv.TextAndCSVParser
text/tsv class org.apache.tika.parser.csv.TextAndCSVParser
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the tika-parser-tagsoup-module artifact,
  the TagSoup-based HTML parser module.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from tika-parsers-extended, whose POM is resolved one directory up. -->
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-extended</artifactId>
<version>3.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<artifactId>tika-parser-tagsoup-module</artifactId>
<name>Apache Tika Tagsoup-based HTML parser module</name>


<dependencies>
<!--
  The TagSoup library itself. The tagsoup.version property is not defined
  in this file; presumably it is inherited from a parent POM (TODO confirm).
-->
<dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
<version>${tagsoup.version}</version>
</dependency>
<!-- needed for charset/encoding detection -->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-parser-text-module</artifactId>
<version>${project.version}</version>
</dependency>
<!--
  No version is declared here; presumably it is supplied by an inherited
  dependencyManagement section (TODO confirm).
-->
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.tika.parser.html.tagsoup;

import java.io.InputStream;
import java.util.Arrays;
import java.util.Objects;

import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;

import org.apache.tika.mime.MediaType;

/**
 * Immutable-by-convention holder for the three pieces this class keeps about a
 * data URI: the raw media type string, a base64 flag and the payload bytes.
 * <p>
 * Instances are created through the package-private constructor only. The byte
 * array is stored as given (no defensive copy is made).
 */
public class DataURIScheme {

    /** Media type exactly as supplied to the constructor; may be {@code null}. */
    private final String rawMediaTypeString;
    /** Flag supplied to the constructor and reported by {@link #isBase64()}. */
    private final boolean isBase64;
    /** Payload bytes backing {@link #getInputStream()}. */
    private final byte[] data;

    /**
     * @param mediaTypeString raw media type string, or {@code null} if none was specified
     * @param isBase64        value to report from {@link #isBase64()}
     * @param data            payload bytes; the array reference is kept as-is
     */
    DataURIScheme(String mediaTypeString, boolean isBase64, byte[] data) {
        this.rawMediaTypeString = mediaTypeString;
        this.isBase64 = isBase64;
        this.data = data;
    }

    /**
     * @return a new stream over the payload bytes; every call creates a fresh stream
     */
    public InputStream getInputStream() {
        return new UnsynchronizedByteArrayInputStream(data);
    }

    /**
     * @return the result of parsing the raw media type string, or <code>null</code>
     * when no media type string was specified (a failed parse is reported by
     * {@link MediaType#parse(String)} itself)
     */
    public MediaType getMediaType() {
        return rawMediaTypeString == null ? null : MediaType.parse(rawMediaTypeString);
    }

    /**
     * @return the base64 flag this instance was constructed with
     */
    public boolean isBase64() {
        return isBase64;
    }

    /**
     * Two instances are equal when their base64 flags, raw media type strings
     * and payload bytes (compared element by element) all match.
     */
    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o instanceof DataURIScheme) {
            DataURIScheme other = (DataURIScheme) o;
            // Checks run in the same order as before: flag, media type, then bytes.
            if (isBase64() != other.isBase64()) {
                return false;
            }
            if (!Objects.equals(rawMediaTypeString, other.rawMediaTypeString)) {
                return false;
            }
            return Arrays.equals(data, other.data);
        }
        return false;
    }

    /**
     * Combines the hash of (media type string, base64 flag) with the
     * content hash of the payload bytes, consistent with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        return 31 * Objects.hash(rawMediaTypeString, isBase64()) + Arrays.hashCode(data);
    }
}
Loading

0 comments on commit 5361b6d

Please sign in to comment.