This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 6686a6f fix for TIKA-3139 contributed by wiwi (#328)
6686a6f is described below
commit 6686a6f9fbab19278e1bba4e9d143d3e74b8eef7
Author: jendabenda <[email protected]>
AuthorDate: Thu Jul 16 19:00:14 2020 +0200
fix for TIKA-3139 contributed by wiwi (#328)
Co-authored-by: Jan Prochazka <[email protected]>
---
.../src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java | 4 ++--
.../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
index d136295..12cff95 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
@@ -106,7 +106,7 @@ class XFAExtractor {
}
if (namedFields.size() == 0) {
- xhtml.endElement("xfa_content");
+ xhtml.endElement("div");
return;
}
//now dump fields and values
@@ -135,7 +135,7 @@ class XFAExtractor {
}
xhtml.endElement("ol");
xhtml.endElement("div");
- xhtml.endElement("xfa_content");
+ xhtml.endElement("div");
}
//try to scrape the text until the endElement
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 57b6eb2..320689e 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1141,8 +1141,8 @@ public class PDFParserTest extends TikaTest {
config.setIfXFAExtractOnlyXFA(true);
context.set(PDFParserConfig.class, config);
String xml = getXML("testPDF_XFA_govdocs1_258578.pdf", context).xml;
+ assertContains("<body><div class=\"xfa_content\">", xml);
assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>",
xml);
- assertContains("</xfa_content></body></html>", xml);
assertNotContained("Mount Rushmore National Memorial", xml);
}