diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 35d5ac7a5b..feb5208528 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -387,8 +387,8 @@
(only on the CI, not on local Windows with Docker, see comment in TIKA-4327 on 14.12.2024)
expecting org.eclipse.jetty.client.util.InputStreamResponseListener which is only available
in Jetty up to 11.0.26
- but this class is now in org.eclipse.jetty.client, see also
- https://jetty.org/docs/jetty/12/programming-guide/migration/11-to-12.html
+ but this class is now in org.eclipse.jetty.client, see also
+ https://jetty.org/docs/jetty/12/programming-guide/migration/11-to-12.html
when updating, see also TODO in PipesBiDirectionalStreamingIntegrationTest
and add jakarta.servlet jakarta.servlet-api 6.0.0 to tika-server-core
-->
@@ -457,6 +457,11 @@
10.81.5.123.6.3
+    <okhttp.version>4.12.0</okhttp.version>
+
+    <kotlin.version>1.9.10</kotlin.version>
@@ -1144,6 +1149,36 @@
nimbus-jose-jwt${nimbus-jose-jwt.version}
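+      <dependency>
+        <groupId>com.squareup.okhttp3</groupId>
+        <artifactId>okhttp</artifactId>
+        <version>${okhttp.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.squareup.okhttp3</groupId>
+        <artifactId>mockwebserver</artifactId>
+        <version>${okhttp.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.jetbrains.kotlin</groupId>
+        <artifactId>kotlin-stdlib</artifactId>
+        <version>${kotlin.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.jetbrains.kotlin</groupId>
+        <artifactId>kotlin-stdlib-jdk8</artifactId>
+        <version>${kotlin.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.jetbrains.kotlin</groupId>
+        <artifactId>kotlin-stdlib-jdk7</artifactId>
+        <version>${kotlin.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.jetbrains.kotlin</groupId>
+        <artifactId>kotlin-stdlib-common</artifactId>
+        <version>${kotlin.version}</version>
+      </dependency>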
diff --git a/tika-parsers/tika-parsers-extended/pom.xml b/tika-parsers/tika-parsers-extended/pom.xml
index 47317aa5db..77a333417e 100644
--- a/tika-parsers/tika-parsers-extended/pom.xml
+++ b/tika-parsers/tika-parsers-extended/pom.xml
@@ -33,8 +33,10 @@
tika-parser-sqlite3-moduletika-parser-scientific-module
+ tika-parser-jina-reader-moduletika-parser-sqlite3-packagetika-parser-scientific-package
+ tika-parser-jina-reader-packagetika-parsers-extended-integration-tests
@@ -93,4 +95,4 @@
3.0.0-rc1
-
\ No newline at end of file
+
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml
new file mode 100644
index 0000000000..0977c6ed6d
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml
@@ -0,0 +1,104 @@
+
+
+
+
+ tika-parsers-extended
+ org.apache.tika
+ ${revision}
+
+ 4.0.0
+
+ tika-parser-jina-reader-module
+ Apache Tika Jina Reader parser module
+
+
+ 0.24.0
+
+
+
+
+ com.squareup.okhttp3
+ okhttp
+ ${okhttp.version}
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+
+
+ org.commonmark
+ commonmark
+ ${commonmark.version}
+
+
+ org.commonmark
+ commonmark-ext-gfm-tables
+ ${commonmark.version}
+
+
+ org.commonmark
+ commonmark-ext-gfm-strikethrough
+ ${commonmark.version}
+
+
+ org.slf4j
+ slf4j-api
+
+
+
+
+ com.squareup.okhttp3
+ mockwebserver
+ ${okhttp.version}
+ test
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+
+ org.apache.tika.parser.jina.reader
+
+
+
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+ src/test/resources/test-documents/**
+
+
+
+
+
+
+
+ 3.0.0-rc1
+
+
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java
new file mode 100644
index 0000000000..f8f7717a33
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jina;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Configuration for {@link JinaReaderParser}.
+ *
+ * <p>The parser sends PDF (base64-encoded) or HTML (raw string) content to the
+ * Jina Reader API and receives back clean markdown, which is then converted
+ * to XHTML.
+ *
+ * <p>Setters validate their argument and fail immediately, so that a bad value
+ * is reported where it is configured instead of surfacing later as an unchecked
+ * exception from the HTTP client while a request is being built.
+ */
+public class JinaReaderConfig implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Jina Reader API endpoint. */
+    private String baseUrl = "https://r.jina.ai/";
+
+    /** Bearer token for the Jina Reader API. Blank means "send no Authorization header". */
+    private String apiKey = "";
+
+    /** HTTP timeout in seconds. Jina Reader is a remote service; default is generous. */
+    private int timeoutSeconds = 120;
+
+    /**
+     * Response format requested from Jina Reader.
+     * Valid values: {@code markdown}, {@code html}, {@code text}, {@code screenshot}.
+     * Default is {@code markdown} since we convert it to XHTML.
+     */
+    private String returnFormat = "markdown";
+
+    // ---- getters / setters ------------------------------------------------
+
+    /**
+     * @return the endpoint URL that requests are posted to
+     */
+    public String getBaseUrl() {
+        return baseUrl;
+    }
+
+    /**
+     * Sets the endpoint URL that requests are posted to.
+     *
+     * @param baseUrl endpoint URL; must not be {@code null} or blank
+     * @throws TikaConfigException if {@code baseUrl} is {@code null} or blank
+     */
+    public void setBaseUrl(String baseUrl) throws TikaConfigException {
+        if (baseUrl == null || baseUrl.trim().isEmpty()) {
+            throw new TikaConfigException("baseUrl must not be null or blank");
+        }
+        this.baseUrl = baseUrl;
+    }
+
+    /**
+     * @return the bearer token, possibly blank or {@code null}
+     */
+    public String getApiKey() {
+        return apiKey;
+    }
+
+    /**
+     * Sets the bearer token. A {@code null} or blank key is allowed; the parser
+     * checks for blank keys before adding the Authorization header.
+     *
+     * @param apiKey bearer token, or blank/{@code null} for unauthenticated access
+     * @throws TikaConfigException declared for configuration wiring; not thrown here
+     */
+    public void setApiKey(String apiKey) throws TikaConfigException {
+        this.apiKey = apiKey;
+    }
+
+    /**
+     * @return the HTTP timeout in seconds
+     */
+    public int getTimeoutSeconds() {
+        return timeoutSeconds;
+    }
+
+    /**
+     * Sets the HTTP timeout in seconds; the parser uses it as the client's read timeout.
+     *
+     * @param timeoutSeconds timeout in seconds; must not be negative
+     * @throws IllegalArgumentException if {@code timeoutSeconds} is negative
+     */
+    public void setTimeoutSeconds(int timeoutSeconds) {
+        // A negative timeout cannot be applied to the HTTP client; reject it up front.
+        if (timeoutSeconds < 0) {
+            throw new IllegalArgumentException(
+                    "timeoutSeconds must not be negative: " + timeoutSeconds);
+        }
+        this.timeoutSeconds = timeoutSeconds;
+    }
+
+    /**
+     * @return the response format requested from Jina Reader
+     */
+    public String getReturnFormat() {
+        return returnFormat;
+    }
+
+    /**
+     * Sets the response format requested from Jina Reader. The value is not checked
+     * against a fixed list so that formats added by the service remain usable.
+     *
+     * @param returnFormat format name; must not be {@code null}
+     * @throws IllegalArgumentException if {@code returnFormat} is {@code null}
+     */
+    public void setReturnFormat(String returnFormat) {
+        // The parser sends this value as a request header, which cannot be null.
+        if (returnFormat == null) {
+            throw new IllegalArgumentException("returnFormat must not be null");
+        }
+        this.returnFormat = returnFormat;
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java
new file mode 100644
index 0000000000..12f3efb97b
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java
@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jina;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Base64;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import okhttp3.MediaType;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.RequestBody;
+import okhttp3.Response;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.ConfigDeserializer;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.JsonConfig;
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Parser that sends document content to the
+ * Jina Reader API for clean-text
+ * extraction and returns the result as XHTML.
+ *
+ * Supported types:
+ *
+ *
{@code application/pdf} — bytes are base64-encoded and sent as
+ * {@code {"pdf": ""}}
+ *
{@code text/html} — raw HTML string sent as
+ * {@code {"html": "..."}}
+ *
+ *
+ * Authentication: set {@code apiKey} in the config; it is sent as a
+ * {@code Authorization: Bearer } header.
+ *
+ * Configuration key: {@code "jina-reader-parser"}
+ *
+ * @since Apache Tika 4.0
+ */
+@TikaComponent(name = "jina-reader-parser")
+public class JinaReaderParser implements Parser, Initializable, Closeable {
+
+ private static final long serialVersionUID = 1L;
+
+ private static final Logger LOG = LoggerFactory.getLogger(JinaReaderParser.class);
+
+ private static final MediaType JSON_MEDIA_TYPE =
+ MediaType.parse("application/json; charset=utf-8");
+
+ private static final ObjectMapper MAPPER = new ObjectMapper();
+
+ private static final Set SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+ org.apache.tika.mime.MediaType.application("pdf"),
+ org.apache.tika.mime.MediaType.text("html")
+ )));
+
+ private final JinaReaderConfig config;
+ private transient OkHttpClient httpClient;
+
+ public JinaReaderParser() {
+ this(new JinaReaderConfig());
+ }
+
+ public JinaReaderParser(JinaReaderConfig config) {
+ this.config = config;
+ buildHttpClient();
+ }
+
+ public JinaReaderParser(JsonConfig jsonConfig) {
+ this(ConfigDeserializer.buildConfig(jsonConfig, JinaReaderConfig.class));
+ }
+
+ // ---- Parser -----------------------------------------------------------
+
+ @Override
+ public Set getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+
+ JinaReaderConfig cfg = context.get(JinaReaderConfig.class, config);
+
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ boolean isPdf = contentType != null && contentType.startsWith("application/pdf");
+
+ String requestJson = buildRequestJson(tis, isPdf);
+
+ String markdown = callJinaApi(cfg, requestJson);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.startElement(XHTML, "div", "div", new org.xml.sax.helpers.AttributesImpl());
+ MarkdownToXHTMLEmitter.emit(markdown, xhtml);
+ xhtml.endElement(XHTML, "div", "div");
+ xhtml.endDocument();
+ }
+
+ // ---- Initializable ----------------------------------------------------
+
+ @Override
+ public void initialize() {
+ buildHttpClient();
+ }
+
+ // ---- Closeable --------------------------------------------------------
+
+ @Override
+ public void close() {
+ if (httpClient != null) {
+ httpClient.dispatcher().executorService().shutdown();
+ httpClient.connectionPool().evictAll();
+ }
+ }
+
+ // ---- helpers ----------------------------------------------------------
+
+ String buildRequestJson(TikaInputStream tis, boolean isPdf) throws IOException {
+ ObjectNode root = MAPPER.createObjectNode();
+ if (isPdf) {
+ byte[] bytes = tis.readAllBytes();
+ root.put("pdf", Base64.getEncoder().encodeToString(bytes));
+ } else {
+ String html = new String(tis.readAllBytes(), StandardCharsets.UTF_8);
+ root.put("html", html);
+ }
+ return root.toString();
+ }
+
+ private String callJinaApi(JinaReaderConfig cfg, String requestJson) throws TikaException {
+ Request.Builder builder = new Request.Builder()
+ .url(cfg.getBaseUrl())
+ .post(RequestBody.create(requestJson, JSON_MEDIA_TYPE))
+ .header("Content-Type", "application/json")
+ .header("Accept", "application/json")
+ .header("X-Return-Format", cfg.getReturnFormat());
+
+ if (!StringUtils.isBlank(cfg.getApiKey())) {
+ builder.header("Authorization", "Bearer " + cfg.getApiKey());
+ }
+
+ Request request = builder.build();
+ try (Response response = httpClient.newCall(request).execute()) {
+ if (!response.isSuccessful()) {
+ String body = response.body() != null ? response.body().string() : "";
+ throw new TikaException(
+ "Jina Reader API request failed with HTTP "
+ + response.code() + ": " + body);
+ }
+ String responseBody = response.body() != null ? response.body().string() : "";
+ return extractContent(responseBody);
+ } catch (IOException e) {
+ throw new TikaException("Jina Reader API request failed: " + e.getMessage(), e);
+ }
+ }
+
+ String extractContent(String responseBody) throws TikaException {
+ try {
+ JsonNode root = MAPPER.readTree(responseBody);
+ JsonNode data = root.get("data");
+ if (data == null) {
+ throw new TikaException(
+ "Jina Reader API response missing 'data' field: " + responseBody);
+ }
+ JsonNode content = data.get("content");
+ if (content == null || content.isNull()) {
+ return "";
+ }
+ return content.asText();
+ } catch (IOException e) {
+ throw new TikaException(
+ "Failed to parse Jina Reader API response: " + e.getMessage(), e);
+ }
+ }
+
+ private void buildHttpClient() {
+ httpClient = new OkHttpClient.Builder()
+ .connectTimeout(30, TimeUnit.SECONDS)
+ .readTimeout(config.getTimeoutSeconds(), TimeUnit.SECONDS)
+ .writeTimeout(60, TimeUnit.SECONDS)
+ .build();
+ }
+
+ // ---- config getters/setters for XML/JSON config wiring ----------------
+
+ public String getBaseUrl() {
+ return config.getBaseUrl();
+ }
+
+ public void setBaseUrl(String baseUrl) throws org.apache.tika.exception.TikaConfigException {
+ config.setBaseUrl(baseUrl);
+ }
+
+ public String getApiKey() {
+ return config.getApiKey();
+ }
+
+ public void setApiKey(String apiKey) throws org.apache.tika.exception.TikaConfigException {
+ config.setApiKey(apiKey);
+ }
+
+ public int getTimeoutSeconds() {
+ return config.getTimeoutSeconds();
+ }
+
+ public void setTimeoutSeconds(int timeoutSeconds) {
+ config.setTimeoutSeconds(timeoutSeconds);
+ }
+
+ public String getReturnFormat() {
+ return config.getReturnFormat();
+ }
+
+ public void setReturnFormat(String returnFormat) {
+ config.setReturnFormat(returnFormat);
+ }
+
+ // package-visible for tests
+ JinaReaderConfig getConfig() {
+ return config;
+ }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java
new file mode 100644
index 0000000000..da5b84a1e5
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java
@@ -0,0 +1,409 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jina;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.commonmark.Extension;
+import org.commonmark.ext.gfm.strikethrough.Strikethrough;
+import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension;
+import org.commonmark.ext.gfm.tables.TableBlock;
+import org.commonmark.ext.gfm.tables.TableBody;
+import org.commonmark.ext.gfm.tables.TableCell;
+import org.commonmark.ext.gfm.tables.TableHead;
+import org.commonmark.ext.gfm.tables.TableRow;
+import org.commonmark.ext.gfm.tables.TablesExtension;
+import org.commonmark.node.AbstractVisitor;
+import org.commonmark.node.BlockQuote;
+import org.commonmark.node.BulletList;
+import org.commonmark.node.Code;
+import org.commonmark.node.Document;
+import org.commonmark.node.Emphasis;
+import org.commonmark.node.FencedCodeBlock;
+import org.commonmark.node.HardLineBreak;
+import org.commonmark.node.Heading;
+import org.commonmark.node.HtmlBlock;
+import org.commonmark.node.HtmlInline;
+import org.commonmark.node.Image;
+import org.commonmark.node.IndentedCodeBlock;
+import org.commonmark.node.Link;
+import org.commonmark.node.ListItem;
+import org.commonmark.node.Node;
+import org.commonmark.node.OrderedList;
+import org.commonmark.node.Paragraph;
+import org.commonmark.node.SoftLineBreak;
+import org.commonmark.node.StrongEmphasis;
+import org.commonmark.node.Text;
+import org.commonmark.node.ThematicBreak;
+import org.commonmark.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parses a markdown string using commonmark-java and emits XHTML SAX events.
+ *