Skip to content

Commit 7dabca9

Browse files
authored
* TIKA-3109 -- parse iframe's srcdoc as an embedded document
1 parent 0ed9f8a commit 7dabca9

File tree

4 files changed

+89
-0
lines changed

4 files changed

+89
-0
lines changed

CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ Release 2.8.1 - ???
77

88
* Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116).
99

10+
* Parse iframe's srcdoc as an embedded file (TIKA-3109).
11+
1012
* Add detection of warc.gz as a specialization of gz and parse as if a standard WARC (TIKA-4048).
1113

1214
* Allow users to modify the attachment limit size in the /unpack resource (TIKA-4039).

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,12 @@ public void startElement(String uri, String local, String name, Attributes atts)
187187
handleDataURIScheme(value);
188188
}
189189
}
190+
if ("IFRAME".equals(name)) {
191+
String srcDoc = atts.getValue("srcdoc");
192+
if (!StringUtils.isBlank(srcDoc)) {
193+
handleSrcDoc(srcDoc);
194+
}
195+
}
190196
}
191197

192198
/**
@@ -339,6 +345,22 @@ public void endElement(String uri, String local, String name) throws SAXExceptio
339345
discardLevel--;
340346
}
341347
}
348+
private void handleSrcDoc(String string) throws SAXException {
349+
Metadata m = new Metadata();
350+
m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
351+
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
352+
m.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "text/html");
353+
//TODO add metadata about iframe content?
354+
EmbeddedDocumentExtractor embeddedDocumentExtractor =
355+
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
356+
if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
357+
try (InputStream stream = new UnsynchronizedByteArrayInputStream(string.getBytes(StandardCharsets.UTF_8))) {
358+
embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, true);
359+
} catch (IOException e) {
360+
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
361+
}
362+
}
363+
}
342364

343365
private void handleDataURIScheme(String string) throws SAXException {
344366
DataURIScheme dataURIScheme = null;
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.tika.parser.html;
19+
20+
import static org.junit.jupiter.api.Assertions.assertEquals;
21+
22+
import java.util.List;
23+
24+
import org.junit.jupiter.api.Test;
25+
26+
import org.apache.tika.TikaTest;
27+
import org.apache.tika.metadata.Metadata;
28+
import org.apache.tika.metadata.TikaCoreProperties;
29+
30+
public class SrcDocTest extends TikaTest {
31+
32+
33+
@Test
34+
public void testBasic() throws Exception {
35+
List<Metadata> metadataList = getRecursiveMetadata("testSrcDoc.html");
36+
assertEquals(2, metadataList.size());
37+
assertContains("outside", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
38+
assertContains("this is the iframe content",
39+
metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
40+
assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
41+
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
42+
}
43+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<!--
2+
Licensed to the Apache Software Foundation (ASF) under one or more
3+
contributor license agreements. See the NOTICE file distributed with
4+
this work for additional information regarding copyright ownership.
5+
The ASF licenses this file to You under the Apache License, Version 2.0
6+
(the "License"); you may not use this file except in compliance with
7+
the License. You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
-->
17+
<html>
18+
<body>
19+
outside
20+
<iframe srcdoc="<p>this is the iframe content</p>"></iframe>
21+
</body>
22+
</html>

0 commit comments

Comments
 (0)