diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 35d5ac7a5b..feb5208528 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -387,8 +387,8 @@ (only on the CI, not on local Windows with Docker, see comment in TIKA-4327 on 14.12.2024) expecting org.eclipse.jetty.client.util.InputStreamResponseListener which is only available in Jetty up to 11.0.26 - but this class is now in org.eclipse.jetty.client, see also - https://jetty.org/docs/jetty/12/programming-guide/migration/11-to-12.html + but this class is now in org.eclipse.jetty.client, see also + https://jetty.org/docs/jetty/12/programming-guide/migration/11-to-12.html when updating, see also TODO in PipesBiDirectionalStreamingIntegrationTest and add jakarta.servlet jakarta.servlet-api 6.0.0 to tika-server-core --> @@ -457,6 +457,11 @@ 10.8 1.5.12 3.6.3 + 4.12.0 + + 1.9.10 @@ -1144,6 +1149,36 @@ nimbus-jose-jwt ${nimbus-jose-jwt.version} + + com.squareup.okhttp3 + okhttp + ${okhttp.version} + + + com.squareup.okhttp3 + mockwebserver + ${okhttp.version} + + + org.jetbrains.kotlin + kotlin-stdlib + ${kotlin.version} + + + org.jetbrains.kotlin + kotlin-stdlib-jdk8 + ${kotlin.version} + + + org.jetbrains.kotlin + kotlin-stdlib-jdk7 + ${kotlin.version} + + + org.jetbrains.kotlin + kotlin-stdlib-common + ${kotlin.version} + diff --git a/tika-parsers/tika-parsers-extended/pom.xml b/tika-parsers/tika-parsers-extended/pom.xml index 47317aa5db..77a333417e 100644 --- a/tika-parsers/tika-parsers-extended/pom.xml +++ b/tika-parsers/tika-parsers-extended/pom.xml @@ -33,8 +33,10 @@ tika-parser-sqlite3-module tika-parser-scientific-module + tika-parser-jina-reader-module tika-parser-sqlite3-package tika-parser-scientific-package + tika-parser-jina-reader-package tika-parsers-extended-integration-tests @@ -93,4 +95,4 @@ 3.0.0-rc1 - \ No newline at end of file + diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml 
b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml new file mode 100644 index 0000000000..0977c6ed6d --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml @@ -0,0 +1,104 @@ + + + + + tika-parsers-extended + org.apache.tika + ${revision} + + 4.0.0 + + tika-parser-jina-reader-module + Apache Tika Jina Reader parser module + + + 0.24.0 + + + + + com.squareup.okhttp3 + okhttp + ${okhttp.version} + + + com.fasterxml.jackson.core + jackson-databind + + + org.commonmark + commonmark + ${commonmark.version} + + + org.commonmark + commonmark-ext-gfm-tables + ${commonmark.version} + + + org.commonmark + commonmark-ext-gfm-strikethrough + ${commonmark.version} + + + org.slf4j + slf4j-api + + + + + com.squareup.okhttp3 + mockwebserver + ${okhttp.version} + test + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + org.apache.tika.parser.jina.reader + + + + + + org.apache.rat + apache-rat-plugin + + + src/test/resources/test-documents/** + + + + + + + + 3.0.0-rc1 + + diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java new file mode 100644 index 0000000000..f8f7717a33 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.jina; + +import java.io.Serializable; + +import org.apache.tika.exception.TikaConfigException; + +/** + * Configuration for {@link JinaReaderParser}. + *

+ * Sends PDF (base64-encoded) or HTML (raw string) content to the
+ * Jina Reader API and receives
+ * back clean markdown, which is then converted to XHTML.
+ * <p>
+ * Setters validate their argument eagerly so that a broken configuration is
+ * reported when it is loaded rather than as an opaque HTTP-client failure on
+ * the first parse.
+ */
+public class JinaReaderConfig implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Jina Reader API endpoint. */
+    private String baseUrl = "https://r.jina.ai/";
+
+    /** Bearer token for the Jina Reader API; blank means "send no Authorization header". */
+    private String apiKey = "";
+
+    /** HTTP read timeout in seconds. Jina Reader is a remote service; default is generous. */
+    private int timeoutSeconds = 120;
+
+    /**
+     * Response format requested from Jina Reader.
+     * Valid values: {@code markdown}, {@code html}, {@code text}, {@code screenshot}.
+     * Default is {@code markdown}; {@link JinaReaderParser} always interprets the
+     * returned content as markdown when converting it to XHTML.
+     */
+    private String returnFormat = "markdown";
+
+    // ---- getters / setters ------------------------------------------------
+
+    /**
+     * @return the Jina Reader endpoint URL
+     */
+    public String getBaseUrl() {
+        return baseUrl;
+    }
+
+    /**
+     * Sets the Jina Reader endpoint URL.
+     *
+     * @param baseUrl endpoint URL; must not be {@code null} or blank
+     * @throws TikaConfigException if {@code baseUrl} is {@code null} or blank
+     */
+    public void setBaseUrl(String baseUrl) throws TikaConfigException {
+        if (baseUrl == null || baseUrl.trim().isEmpty()) {
+            throw new TikaConfigException("baseUrl must not be null or blank");
+        }
+        this.baseUrl = baseUrl;
+    }
+
+    /**
+     * @return the bearer token, possibly blank
+     */
+    public String getApiKey() {
+        return apiKey;
+    }
+
+    /**
+     * Sets the bearer token. A blank value is allowed and disables the
+     * {@code Authorization} header.
+     *
+     * @param apiKey the bearer token
+     * @throws TikaConfigException never thrown by this implementation; kept for
+     *                             interface compatibility
+     */
+    public void setApiKey(String apiKey) throws TikaConfigException {
+        this.apiKey = apiKey;
+    }
+
+    /**
+     * @return the HTTP read timeout in seconds
+     */
+    public int getTimeoutSeconds() {
+        return timeoutSeconds;
+    }
+
+    /**
+     * Sets the HTTP read timeout.
+     *
+     * @param timeoutSeconds timeout in seconds; must not be negative
+     * @throws IllegalArgumentException if {@code timeoutSeconds} is negative
+     */
+    public void setTimeoutSeconds(int timeoutSeconds) {
+        if (timeoutSeconds < 0) {
+            throw new IllegalArgumentException(
+                    "timeoutSeconds must be >= 0, got " + timeoutSeconds);
+        }
+        this.timeoutSeconds = timeoutSeconds;
+    }
+
+    /**
+     * @return the value sent in the {@code X-Return-Format} request header
+     */
+    public String getReturnFormat() {
+        return returnFormat;
+    }
+
+    /**
+     * Sets the value sent in the {@code X-Return-Format} request header.
+     *
+     * @param returnFormat the format name; must not be {@code null} or blank
+     * @throws IllegalArgumentException if {@code returnFormat} is {@code null} or blank
+     */
+    public void setReturnFormat(String returnFormat) {
+        if (returnFormat == null || returnFormat.trim().isEmpty()) {
+            throw new IllegalArgumentException("returnFormat must not be null or blank");
+        }
+        this.returnFormat = returnFormat;
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java new file mode 100644 index 0000000000..12f3efb97b --- /dev/null +++
b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.jina; + +import static org.apache.tika.sax.XHTMLContentHandler.XHTML; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Base64; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import okhttp3.MediaType; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.RequestBody; +import okhttp3.Response; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.ConfigDeserializer; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.TikaException; 
+import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; + +/** + * Parser that sends document content to the + * Jina Reader API for clean-text + * extraction and returns the result as XHTML. + *

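+ * Supported types:
+ * <ul>
+ *   <li>{@code application/pdf} — the bytes are sent base64-encoded in the {@code pdf} field</li>
+ *   <li>{@code text/html} — the content is sent as a raw string in the {@code html} field</li>
+ * </ul>
+ * <p>
+ * Authentication: set {@code apiKey} in the config; it is sent as an
+ * {@code Authorization: Bearer <apiKey>} header.
+ * <p>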

+ * Configuration key: {@code "jina-reader-parser"}
+ *
+ * @since Apache Tika 4.0
+ */
+@TikaComponent(name = "jina-reader-parser")
+public class JinaReaderParser implements Parser, Initializable, Closeable {
+
+    private static final long serialVersionUID = 1L;
+
+    private static final Logger LOG = LoggerFactory.getLogger(JinaReaderParser.class);
+
+    private static final MediaType JSON_MEDIA_TYPE =
+            MediaType.parse("application/json; charset=utf-8");
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private static final Set<org.apache.tika.mime.MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    org.apache.tika.mime.MediaType.application("pdf"),
+                    org.apache.tika.mime.MediaType.text("html")
+            )));
+
+    private final JinaReaderConfig config;
+
+    /** Shared client; transient, so it is rebuilt lazily after Java deserialization. */
+    private transient OkHttpClient httpClient;
+
+    public JinaReaderParser() {
+        this(new JinaReaderConfig());
+    }
+
+    public JinaReaderParser(JinaReaderConfig config) {
+        this.config = config;
+        buildHttpClient();
+    }
+
+    public JinaReaderParser(JsonConfig jsonConfig) {
+        this(ConfigDeserializer.buildConfig(jsonConfig, JinaReaderConfig.class));
+    }
+
+    // ---- Parser -----------------------------------------------------------
+
+    @Override
+    public Set<org.apache.tika.mime.MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Sends the stream content to Jina Reader and emits the returned markdown as
+     * XHTML wrapped in a single {@code div}.
+     * <p>
+     * The stream is treated as PDF when the {@code Content-Type} metadata value
+     * starts with {@code application/pdf}; anything else is sent as HTML. A
+     * {@link JinaReaderConfig} found in the {@link ParseContext} overrides the
+     * parser's own configuration for this call, including its timeout.
+     *
+     * @throws TikaException if the request cannot be built, the API call fails,
+     *                       or the response cannot be interpreted
+     */
+    @Override
+    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+
+        JinaReaderConfig cfg = context.get(JinaReaderConfig.class, config);
+
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        boolean isPdf = contentType != null && contentType.startsWith("application/pdf");
+
+        String requestJson = buildRequestJson(tis, isPdf);
+        String markdown = callJinaApi(cfg, requestJson);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement(XHTML, "div", "div", new org.xml.sax.helpers.AttributesImpl());
+        MarkdownToXHTMLEmitter.emit(markdown, xhtml);
+        xhtml.endElement(XHTML, "div", "div");
+        xhtml.endDocument();
+    }
+
+    // ---- Initializable ----------------------------------------------------
+
+    @Override
+    public void initialize() {
+        buildHttpClient();
+    }
+
+    // ---- Closeable --------------------------------------------------------
+
+    @Override
+    public void close() {
+        if (httpClient != null) {
+            httpClient.dispatcher().executorService().shutdown();
+            httpClient.connectionPool().evictAll();
+        }
+    }
+
+    // ---- helpers ----------------------------------------------------------
+
+    /**
+     * Builds the JSON request body: {@code {"pdf": <base64>}} for PDF input,
+     * otherwise {@code {"html": <text>}} with the bytes decoded as UTF-8.
+     */
+    String buildRequestJson(TikaInputStream tis, boolean isPdf) throws IOException {
+        ObjectNode root = MAPPER.createObjectNode();
+        byte[] bytes = tis.readAllBytes();
+        if (isPdf) {
+            root.put("pdf", Base64.getEncoder().encodeToString(bytes));
+        } else {
+            root.put("html", new String(bytes, StandardCharsets.UTF_8));
+        }
+        return root.toString();
+    }
+
+    /**
+     * Posts the request JSON to the configured endpoint and returns the
+     * extracted {@code data.content} string.
+     */
+    private String callJinaApi(JinaReaderConfig cfg, String requestJson) throws TikaException {
+        Request request = buildRequest(cfg, requestJson);
+        try (Response response = clientFor(cfg).newCall(request).execute()) {
+            String body = response.body() != null ? response.body().string() : "";
+            if (!response.isSuccessful()) {
+                LOG.debug("Jina Reader API returned HTTP {}", response.code());
+                throw new TikaException(
+                        "Jina Reader API request failed with HTTP "
+                                + response.code() + ": " + body);
+            }
+            return extractContent(body);
+        } catch (IOException e) {
+            throw new TikaException("Jina Reader API request failed: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Assembles the HTTP request, converting configuration problems (missing or
+     * malformed URL, invalid header value) into a {@link TikaException} so they
+     * do not escape {@code parse} as unchecked exceptions.
+     */
+    private Request buildRequest(JinaReaderConfig cfg, String requestJson) throws TikaException {
+        if (cfg.getBaseUrl() == null || cfg.getReturnFormat() == null) {
+            throw new TikaException(
+                    "Jina Reader configuration requires both baseUrl and returnFormat");
+        }
+        try {
+            Request.Builder builder = new Request.Builder()
+                    .url(cfg.getBaseUrl())
+                    .post(RequestBody.create(requestJson, JSON_MEDIA_TYPE))
+                    .header("Content-Type", "application/json")
+                    .header("Accept", "application/json")
+                    .header("X-Return-Format", cfg.getReturnFormat());
+            if (!StringUtils.isBlank(cfg.getApiKey())) {
+                builder.header("Authorization", "Bearer " + cfg.getApiKey());
+            }
+            return builder.build();
+        } catch (IllegalArgumentException e) {
+            throw new TikaException(
+                    "Invalid Jina Reader configuration: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Returns a client whose read timeout matches {@code cfg}. When the timeout
+     * differs from the shared client's (per-request config from the
+     * ParseContext, or a setter called after construction) a derived client is
+     * created with {@code newBuilder()}, which reuses the shared client's
+     * connection pool and dispatcher.
+     */
+    private OkHttpClient clientFor(JinaReaderConfig cfg) {
+        OkHttpClient base = sharedClient();
+        long wantedMillis = TimeUnit.SECONDS.toMillis(cfg.getTimeoutSeconds());
+        if (base.readTimeoutMillis() == wantedMillis) {
+            return base;
+        }
+        return base.newBuilder()
+                .readTimeout(cfg.getTimeoutSeconds(), TimeUnit.SECONDS)
+                .build();
+    }
+
+    /** Returns the shared client, building it first if the field is still null. */
+    private synchronized OkHttpClient sharedClient() {
+        if (httpClient == null) {
+            buildHttpClient();
+        }
+        return httpClient;
+    }
+
+    /**
+     * Pulls {@code data.content} out of a Jina Reader JSON response.
+     *
+     * @return the content string, or an empty string when {@code content} is
+     *         absent or JSON null
+     * @throws TikaException if the body is not valid JSON or has no {@code data} field
+     */
+    String extractContent(String responseBody) throws TikaException {
+        JsonNode root;
+        try {
+            root = MAPPER.readTree(responseBody == null ? "" : responseBody);
+        } catch (IOException e) {
+            throw new TikaException(
+                    "Failed to parse Jina Reader API response: " + e.getMessage(), e);
+        }
+        // readTree may yield null for an empty document, depending on the Jackson version.
+        JsonNode data = root == null ? null : root.get("data");
+        if (data == null) {
+            throw new TikaException(
+                    "Jina Reader API response missing 'data' field: " + responseBody);
+        }
+        JsonNode content = data.get("content");
+        if (content == null || content.isNull()) {
+            return "";
+        }
+        return content.asText();
+    }
+
+    private synchronized void buildHttpClient() {
+        httpClient = new OkHttpClient.Builder()
+                .connectTimeout(30, TimeUnit.SECONDS)
+                .readTimeout(config.getTimeoutSeconds(), TimeUnit.SECONDS)
+                .writeTimeout(60, TimeUnit.SECONDS)
+                .build();
+    }
+
+    // ---- config getters/setters for XML/JSON config wiring ----------------
+
+    public String getBaseUrl() {
+        return config.getBaseUrl();
+    }
+
+    public void setBaseUrl(String baseUrl) throws org.apache.tika.exception.TikaConfigException {
+        config.setBaseUrl(baseUrl);
+    }
+
+    public String getApiKey() {
+        return config.getApiKey();
+    }
+
+    public void setApiKey(String apiKey) throws org.apache.tika.exception.TikaConfigException {
+        config.setApiKey(apiKey);
+    }
+
+    public int getTimeoutSeconds() {
+        return config.getTimeoutSeconds();
+    }
+
+    public void setTimeoutSeconds(int timeoutSeconds) {
+        config.setTimeoutSeconds(timeoutSeconds);
+    }
+
+    public String getReturnFormat() {
+        return config.getReturnFormat();
+    }
+
+    public void setReturnFormat(String returnFormat) {
+        config.setReturnFormat(returnFormat);
+    }
+
+    // package-visible for tests
+    JinaReaderConfig getConfig() {
+        return config;
+    }
+}
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java new file mode 100644 index 0000000000..da5b84a1e5 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java @@ -0,0 +1,409 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.jina; + +import java.util.Arrays; +import java.util.List; + +import org.commonmark.Extension; +import org.commonmark.ext.gfm.strikethrough.Strikethrough; +import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension; +import org.commonmark.ext.gfm.tables.TableBlock; +import org.commonmark.ext.gfm.tables.TableBody; +import org.commonmark.ext.gfm.tables.TableCell; +import org.commonmark.ext.gfm.tables.TableHead; +import org.commonmark.ext.gfm.tables.TableRow; +import org.commonmark.ext.gfm.tables.TablesExtension; +import org.commonmark.node.AbstractVisitor; +import org.commonmark.node.BlockQuote; +import org.commonmark.node.BulletList; +import org.commonmark.node.Code; +import org.commonmark.node.Document; +import org.commonmark.node.Emphasis; +import org.commonmark.node.FencedCodeBlock; +import org.commonmark.node.HardLineBreak; +import org.commonmark.node.Heading; +import org.commonmark.node.HtmlBlock; +import org.commonmark.node.HtmlInline; +import org.commonmark.node.Image; +import org.commonmark.node.IndentedCodeBlock; +import org.commonmark.node.Link; +import org.commonmark.node.ListItem; +import org.commonmark.node.Node; +import org.commonmark.node.OrderedList; +import org.commonmark.node.Paragraph; +import org.commonmark.node.SoftLineBreak; +import org.commonmark.node.StrongEmphasis; +import org.commonmark.node.Text; +import org.commonmark.node.ThematicBreak; +import org.commonmark.parser.Parser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Parses a markdown string using commonmark-java and emits XHTML SAX events. + *

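+ * Supports:
+ * <ul>
+ *   <li>headings, paragraphs, block quotes and thematic breaks</li>
+ *   <li>bullet and ordered lists</li>
+ *   <li>fenced and indented code blocks, inline code</li>
+ *   <li>emphasis, strong emphasis, links, images and line breaks</li>
+ *   <li>GFM tables and strikethrough (via commonmark extensions)</li>
+ * </ul>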

+ *
+ * @since Apache Tika 4.0
+ */
+class MarkdownToXHTMLEmitter {
+
+    private static final List<Extension> EXTENSIONS = Arrays.asList(
+            TablesExtension.create(),
+            StrikethroughExtension.create()
+    );
+
+    private static final Parser PARSER = Parser.builder()
+            .extensions(EXTENSIONS)
+            .build();
+
+    /**
+     * Namespace for every emitted element. The caller wraps the output in an
+     * XHTML-namespaced {@code div}, so the children must use the same namespace;
+     * Tika's XHTMLContentHandler also keys its block-level whitespace handling
+     * on this namespace, which keeps adjacent blocks apart in text output.
+     */
+    private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
+
+    private static final AttributesImpl EMPTY_ATTRS = new AttributesImpl();
+
+    private MarkdownToXHTMLEmitter() {
+        // static utility
+    }
+
+    /**
+     * Parses the given markdown text and emits SAX events to the handler.
+     * <p>
+     * The caller is responsible for calling {@code startDocument} /
+     * {@code endDocument} on the handler if desired — this method only emits
+     * the body-level elements. A {@code null} or empty input emits nothing.
+     *
+     * @param markdown the markdown text to parse
+     * @param handler  the SAX content handler to receive events
+     * @throws SAXException if the handler throws
+     */
+    static void emit(String markdown, ContentHandler handler) throws SAXException {
+        if (markdown == null || markdown.isEmpty()) {
+            return;
+        }
+        Node document = PARSER.parse(markdown);
+        SAXVisitor visitor = new SAXVisitor(handler);
+        document.accept(visitor);
+        if (visitor.saxException != null) {
+            throw visitor.saxException;
+        }
+    }
+
+    /**
+     * commonmark AST visitor that fires SAX events for each node.
+     * <p>
+     * Visitor callbacks cannot throw checked exceptions, so the first
+     * {@link SAXException} is stored in {@link #saxException} and every later
+     * event is suppressed; {@code emit} rethrows it once traversal finishes.
+     */
+    private static class SAXVisitor extends AbstractVisitor {
+
+        private final ContentHandler handler;
+        SAXException saxException;
+
+        SAXVisitor(ContentHandler handler) {
+            this.handler = handler;
+        }
+
+        // --- block nodes ---
+
+        @Override
+        public void visit(Document document) {
+            visitChildren(document);
+        }
+
+        @Override
+        public void visit(Heading heading) {
+            wrap("h" + heading.getLevel(), EMPTY_ATTRS, heading);
+        }
+
+        @Override
+        public void visit(Paragraph paragraph) {
+            // Skip wrapping <p> inside list items — commonmark puts list item
+            // content in Paragraph nodes, which would produce
+            // <li><p>text</p></li>. We emit the text directly, except when the
+            // previous sibling is also a paragraph: a second paragraph in the
+            // same item keeps its <p> so the two texts do not run together.
+            boolean unwrap = paragraph.getParent() instanceof ListItem
+                    && !(paragraph.getPrevious() instanceof Paragraph);
+            if (unwrap) {
+                visitChildren(paragraph);
+                return;
+            }
+            wrap("p", EMPTY_ATTRS, paragraph);
+        }
+
+        @Override
+        public void visit(BlockQuote blockQuote) {
+            wrap("blockquote", EMPTY_ATTRS, blockQuote);
+        }
+
+        @Override
+        public void visit(BulletList bulletList) {
+            wrap("ul", EMPTY_ATTRS, bulletList);
+        }
+
+        @Override
+        public void visit(OrderedList orderedList) {
+            wrap("ol", EMPTY_ATTRS, orderedList);
+        }
+
+        @Override
+        public void visit(ListItem listItem) {
+            wrap("li", EMPTY_ATTRS, listItem);
+        }
+
+        @Override
+        public void visit(FencedCodeBlock fencedCodeBlock) {
+            AttributesImpl attrs = EMPTY_ATTRS;
+            String info = fencedCodeBlock.getInfo();
+            if (info != null) {
+                // Only the first word of the info string names the language;
+                // guard against an empty split result for whitespace-only input.
+                String[] words = info.trim().split("\\s+");
+                if (words.length > 0 && !words[0].isEmpty()) {
+                    attrs = new AttributesImpl();
+                    attrs.addAttribute("", "class", "class", "CDATA", "language-" + words[0]);
+                }
+            }
+            startElement("pre");
+            startElement("code", attrs);
+            characters(fencedCodeBlock.getLiteral());
+            endElement("code");
+            endElement("pre");
+        }
+
+        @Override
+        public void visit(IndentedCodeBlock indentedCodeBlock) {
+            startElement("pre");
+            startElement("code");
+            characters(indentedCodeBlock.getLiteral());
+            endElement("code");
+            endElement("pre");
+        }
+
+        @Override
+        public void visit(ThematicBreak thematicBreak) {
+            emptyElement("hr");
+        }
+
+        @Override
+        public void visit(HtmlBlock htmlBlock) {
+            // Emit raw HTML content as plain text — we don't parse nested HTML
+            characters(htmlBlock.getLiteral());
+        }
+
+        // --- inline nodes ---
+
+        @Override
+        public void visit(Text text) {
+            characters(text.getLiteral());
+        }
+
+        @Override
+        public void visit(StrongEmphasis strongEmphasis) {
+            wrap("b", EMPTY_ATTRS, strongEmphasis);
+        }
+
+        @Override
+        public void visit(Emphasis emphasis) {
+            wrap("i", EMPTY_ATTRS, emphasis);
+        }
+
+        @Override
+        public void visit(Code code) {
+            startElement("code");
+            characters(code.getLiteral());
+            endElement("code");
+        }
+
+        @Override
+        public void visit(Link link) {
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", "href", "href", "CDATA", link.getDestination());
+            if (link.getTitle() != null && !link.getTitle().isEmpty()) {
+                attrs.addAttribute("", "title", "title", "CDATA", link.getTitle());
+            }
+            wrap("a", attrs, link);
+        }
+
+        @Override
+        public void visit(Image image) {
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", "src", "src", "CDATA", image.getDestination());
+            if (image.getTitle() != null && !image.getTitle().isEmpty()) {
+                attrs.addAttribute("", "title", "title", "CDATA", image.getTitle());
+            }
+            // The alt text is the image's inline content flattened to plain
+            // text, including text nested inside emphasis or inline code.
+            StringBuilder alt = new StringBuilder();
+            appendPlainText(image, alt);
+            attrs.addAttribute("", "alt", "alt", "CDATA", alt.toString());
+            emptyElement("img", attrs);
+        }
+
+        @Override
+        public void visit(HardLineBreak hardLineBreak) {
+            emptyElement("br");
+        }
+
+        @Override
+        public void visit(SoftLineBreak softLineBreak) {
+            characters(" ");
+        }
+
+        @Override
+        public void visit(HtmlInline htmlInline) {
+            // Emit inline HTML as plain text
+            characters(htmlInline.getLiteral());
+        }
+
+        // --- GFM extensions ---
+
+        @Override
+        public void visit(org.commonmark.node.CustomBlock customBlock) {
+            if (customBlock instanceof TableBlock) {
+                wrap("table", EMPTY_ATTRS, customBlock);
+            } else {
+                visitChildren(customBlock);
+            }
+        }
+
+        @Override
+        public void visit(org.commonmark.node.CustomNode customNode) {
+            if (customNode instanceof TableHead) {
+                wrap("thead", EMPTY_ATTRS, customNode);
+            } else if (customNode instanceof TableBody) {
+                wrap("tbody", EMPTY_ATTRS, customNode);
+            } else if (customNode instanceof TableRow) {
+                wrap("tr", EMPTY_ATTRS, customNode);
+            } else if (customNode instanceof TableCell) {
+                TableCell cell = (TableCell) customNode;
+                AttributesImpl attrs = EMPTY_ATTRS;
+                String align = alignValue(cell.getAlignment());
+                if (align != null) {
+                    attrs = new AttributesImpl();
+                    attrs.addAttribute("", "align", "align", "CDATA", align);
+                }
+                wrap(cell.isHeader() ? "th" : "td", attrs, customNode);
+            } else if (customNode instanceof Strikethrough) {
+                wrap("s", EMPTY_ATTRS, customNode);
+            } else {
+                visitChildren(customNode);
+            }
+        }
+
+        /** Maps a table-cell alignment to its {@code align} attribute value, or null. */
+        private static String alignValue(TableCell.Alignment alignment) {
+            if (alignment == null) {
+                return null;
+            }
+            switch (alignment) {
+                case LEFT:
+                    return "left";
+                case CENTER:
+                    return "center";
+                case RIGHT:
+                    return "right";
+                default:
+                    return null;
+            }
+        }
+
+        /** Appends the literal text of all Text and Code descendants of {@code node}. */
+        private static void appendPlainText(Node node, StringBuilder out) {
+            for (Node child = node.getFirstChild(); child != null; child = child.getNext()) {
+                if (child instanceof Text) {
+                    out.append(((Text) child).getLiteral());
+                } else if (child instanceof Code) {
+                    out.append(((Code) child).getLiteral());
+                } else {
+                    appendPlainText(child, out);
+                }
+            }
+        }
+
+        // --- SAX helpers ---
+
+        /** Emits {@code <tag>}, the node's children, then {@code </tag>}. */
+        private void wrap(String tag, AttributesImpl attrs, Node node) {
+            startElement(tag, attrs);
+            visitChildren(node);
+            endElement(tag);
+        }
+
+        private void startElement(String localName) {
+            startElement(localName, EMPTY_ATTRS);
+        }
+
+        private void startElement(String localName, AttributesImpl attrs) {
+            if (saxException != null) {
+                return;
+            }
+            try {
+                handler.startElement(XHTML_NS, localName, localName, attrs);
+            } catch (SAXException e) {
+                saxException = e;
+            }
+        }
+
+        private void endElement(String localName) {
+            if (saxException != null) {
+                return;
+            }
+            try {
+                handler.endElement(XHTML_NS, localName, localName);
+            } catch (SAXException e) {
+                saxException = e;
+            }
+        }
+
+        private void emptyElement(String localName) {
+            emptyElement(localName, EMPTY_ATTRS);
+        }
+
+        private void emptyElement(String localName, AttributesImpl attrs) {
+            startElement(localName, attrs);
+            endElement(localName);
+        }
+
+        private void characters(String text) {
+            if (saxException != null || text == null || text.isEmpty()) {
+                return;
+            }
+            try {
+                char[] chars = text.toCharArray();
+                handler.characters(chars, 0, chars.length);
+            } catch (SAXException e) {
+                saxException = e;
+            }
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/test/java/org/apache/tika/parser/jina/JinaReaderParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/test/java/org/apache/tika/parser/jina/JinaReaderParserTest.java new file mode 100644 index 0000000000..43d6ded190 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/test/java/org/apache/tika/parser/jina/JinaReaderParserTest.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+package org.apache.tika.parser.jina;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Base64;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import okhttp3.mockwebserver.MockResponse;
+import okhttp3.mockwebserver.MockWebServer;
+import okhttp3.mockwebserver.RecordedRequest;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+
+/**
+ * Exercises {@link JinaReaderParser} against a local {@link MockWebServer}
+ * standing in for the Jina Reader API.
+ */
+public class JinaReaderParserTest {
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private MockWebServer server;
+    private JinaReaderParser parser;
+    private JinaReaderConfig config;
+
+    @BeforeEach
+    void setUp() throws Exception {
+        server = new MockWebServer();
+        server.start();
+
+        config = new JinaReaderConfig();
+        config.setBaseUrl(server.url("/").toString());
+        config.setApiKey("test-key");
+        config.setTimeoutSeconds(10);
+
+        parser = new JinaReaderParser(config);
+    }
+
+    @AfterEach
+    void tearDown() throws Exception {
+        // Close the client first and shut the server down in finally, so a
+        // failure in one step cannot leave the other resource open.
+        try {
+            parser.close();
+        } finally {
+            server.shutdown();
+        }
+    }
+
+    @Test
+    void testPdfParsing() throws Exception {
+        String markdown = "# My PDF Title\n\nSome paragraph text.";
+        server.enqueue(new MockResponse()
+                .setBody(buildJinaResponse(markdown))
+                .setHeader("Content-Type", "application/json"));
+
+        byte[] fakePdf = "%PDF-1.4 fake pdf content".getBytes(StandardCharsets.UTF_8);
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+
+        BodyContentHandler handler = new BodyContentHandler();
+        try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(fakePdf))) {
+            parser.parse(tis, handler, metadata, new ParseContext());
+        }
+
+        assertTrue(handler.toString().contains("My PDF Title"));
+        assertTrue(handler.toString().contains("Some paragraph text."));
+
+        RecordedRequest request = server.takeRequest();
+        assertEquals("POST", request.getMethod());
+        assertEquals("Bearer test-key", request.getHeader("Authorization"));
+        assertEquals("markdown", request.getHeader("X-Return-Format"));
+
+        JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+        assertTrue(body.has("pdf"), "Request should have 'pdf' field");
+        String decoded = new String(Base64.getDecoder().decode(body.get("pdf").asText()),
+                StandardCharsets.UTF_8);
+        assertTrue(decoded.startsWith("%PDF"));
+    }
+
+    @Test
+    void testHtmlParsing() throws Exception {
+        String markdown = "## Article Heading\n\nClean content here.";
+        server.enqueue(new MockResponse()
+                .setBody(buildJinaResponse(markdown))
+                .setHeader("Content-Type", "application/json"));
+
+        String html = "<html><head><title>Article</title></head>"
+                + "<body><p>Content</p></body></html>";
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html");
+
+        BodyContentHandler handler = new BodyContentHandler();
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)))) {
+            parser.parse(tis, handler, metadata, new ParseContext());
+        }
+
+        assertTrue(handler.toString().contains("Article Heading"));
+
+        RecordedRequest request = server.takeRequest();
+        JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+        assertTrue(body.has("html"), "Request should have 'html' field");
+        assertTrue(body.get("html").asText().contains("<html>"));
+    }
+
+    @Test
+    void testApiError() throws Exception {
+        server.enqueue(new MockResponse()
+                .setResponseCode(400)
+                .setBody("{\"error\":\"No URL provided\"}"));
+
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+
+        assertThrows(TikaException.class, () -> {
+            try (TikaInputStream tis = TikaInputStream.get(
+                    new ByteArrayInputStream(new byte[]{1, 2, 3}))) {
+                parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
+            }
+        });
+    }
+
+    @Test
+    void testExtractContent() throws TikaException {
+        String response = buildJinaResponse("Hello **world**");
+        String content = parser.extractContent(response);
+        assertEquals("Hello **world**", content);
+    }
+
+    @Test
+    void testExtractContentMissingData() {
+        assertThrows(TikaException.class,
+                () -> parser.extractContent("{\"code\":200}"));
+    }
+
+    @Test
+    void testSupportedTypes() {
+        Set<org.apache.tika.mime.MediaType> types =
+                parser.getSupportedTypes(new ParseContext());
+        assertTrue(types.stream().anyMatch(mt -> mt.toString().equals("application/pdf")));
+        assertTrue(types.stream().anyMatch(mt -> mt.toString().equals("text/html")));
+    }
+
+    @Test
+    void testNoApiKeyHeader() throws Exception {
+        config.setApiKey("");
+        // Release the parser created in setUp before replacing it; tearDown
+        // only closes whatever the field references at the end of the test.
+        parser.close();
+        parser = new JinaReaderParser(config);
+
+        server.enqueue(new MockResponse()
+                .setBody(buildJinaResponse("content"))
+                .setHeader("Content-Type", "application/json"));
+
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html");
+
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream("<html></html>".getBytes(StandardCharsets.UTF_8)))) {
+            parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
+        }
+
+        RecordedRequest request = server.takeRequest();
+        assertTrue(request.getHeader("Authorization") == null
+                        || request.getHeader("Authorization").isEmpty(),
+                "No auth header expected when apiKey is blank");
+    }
+
+    /** Builds a minimal Jina-style JSON envelope around the given content string. */
+    private String buildJinaResponse(String content) {
+        String escaped = content.replace("\\", "\\\\").replace("\"", "\\\"")
+                .replace("\n", "\\n");
+        return "{\"code\":200,\"data\":{\"content\":\"" + escaped + "\"}}";
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-package/pom.xml b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-package/pom.xml new file mode 100644 index 0000000000..c27cb7deb5 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-package/pom.xml @@ -0,0 +1,109 @@ + + + + + tika-parsers-extended + org.apache.tika + ${revision} + + 4.0.0 + + tika-parser-jina-reader-package + Apache Tika Jina Reader parser package + + + + ${project.groupId} + tika-parser-jina-reader-module + ${project.version} + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + org.apache.tika.parser.jina.reader + + + + + + org.apache.maven.plugins + maven-shade-plugin + ${maven.shade.version} + + + package + + shade + + + false + + + *:* + + module-info.class + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + META-INF/DEPENDENCIES + META-INF/MANIFEST.MF + META-INF/LICENSE.md + META-INF/NOTICE.md + + + + + + false + + + + + META-INF/LICENSE + target/classes/META-INF/LICENSE + + + + + + + + org.apache.rat + apache-rat-plugin + + + dependency-reduced-pom.xml + + + + + + + + 3.0.0-rc1 + +