This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new f5fa65f TIKA-2310 -- epub parser should parse contents ending in .xml
f5fa65f is described below
commit f5fa65fcc54f08e6ba57cb4b396753fbaa918c4a
Author: tballison <[email protected]>
AuthorDate: Fri Jan 17 15:25:48 2020 -0500
TIKA-2310 -- epub parser should parse contents ending in .xml
---
.../org/apache/tika/parser/epub/EpubParser.java | 21 +++++++++++++++++----
.../apache/tika/parser/epub/EpubParserTest.java | 12 +++++++++++-
.../resources/test-documents/testEPUB_xml_ext.epub | Bin 0 -> 7823 bytes
3 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 557d183..a38d45e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -151,7 +151,8 @@ public class EpubParser extends AbstractParser {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith(".htm") ||
entry.getName().endsWith(".html") ||
- entry.getName().endsWith(".xhtml")) {
+ entry.getName().endsWith(".xhtml") ||
+ entry.getName().endsWith(".xml")) {
content.parse(zip, bodyHandler, metadata, context);
}
entry = zip.getNextZipEntry();
@@ -276,10 +277,20 @@ public class EpubParser extends AbstractParser {
Set<String> processed = new HashSet<>();
for (String id : contentOrderScraper.contentItems) {
HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
- if (hRefMediaPair != null &&
- hRefMediaPair.href != null) {
+ if (hRefMediaPair != null && hRefMediaPair.href != null) {
+ //we need to test for xhtml/xml because the content parser
+ //expects that.
+ boolean shouldParse = false;
String href = hRefMediaPair.href.toLowerCase(Locale.US);
- if (href.endsWith("htm") || href.endsWith("html")) {
+ if (hRefMediaPair.media != null) {
+ String mediaType = hRefMediaPair.media.toLowerCase(Locale.US);
+ if (mediaType.contains("html")) {
+ shouldParse = true;
+ }
+ } else if (href.endsWith("htm") || href.endsWith("html") || href.endsWith(".xml")) {
+ shouldParse = true;
+ }
+ if (shouldParse) {
zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
if (zae != null) {
try (InputStream is = zipFile.getInputStream(zae)) {
@@ -319,6 +330,8 @@ public class EpubParser extends AbstractParser {
return false;
} else if (lc.contains("x-ibooks")) {
return false;
+ } else if (lc.equals("application/x-dtbncx+xml")) {
+ return false;
}
return true;
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index b3d2401..a16732c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -70,7 +70,8 @@ public class EpubParserTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata("testEPUB.epub");
//test attachments
- assertEquals(3, metadataList.size());
+ assertEquals(2, metadataList.size());
+ assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
int tocIndex = xml.indexOf("h3 class=\"toc_heading\">Table of Contents<");
int ch1 = xml.indexOf("<h1>Chapter 1");
@@ -106,4 +107,13 @@ public class EpubParserTest extends TikaTest {
int ch2 = xml.indexOf("<h1>Chapter 2");
assert(ch1 < ch2);
}
+
+ @Test
+ public void testContentsWXMLExtensions() throws Exception {
+ //TIKA-2310
+ List<Metadata> metadataList = getRecursiveMetadata("testEPUB_xml_ext.epub");
+ assertEquals(1, metadataList.size());
+ assertContains("It was a bright cold day in April",
+ metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT));
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testEPUB_xml_ext.epub b/tika-parsers/src/test/resources/test-documents/testEPUB_xml_ext.epub
new file mode 100644
index 0000000..dca56d0
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEPUB_xml_ext.epub differ