Repository: tika
Updated Branches:
  refs/heads/master 23a11eff3 -> 95b2cd127
TIKA-2029: add some content for links so that we don't generate bad html <a href="http://tika.apache.org/"/>

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/95b2cd12
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/95b2cd12
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/95b2cd12

Branch: refs/heads/master
Commit: 95b2cd127346486cece4cb1450f444fd9bd54337
Parents: 23a11ef
Author: tballison <[email protected]>
Authored: Wed Jul 6 16:20:45 2016 -0400
Committer: tballison <[email protected]>
Committed: Wed Jul 6 16:20:45 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 +++++-
 .../test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/95b2cd12/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index d231a09..c3eafdc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -308,11 +308,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     if (annotationlink.getAction() != null) {
                         PDAction action = annotationlink.getAction();
                         if (action instanceof PDActionURI) {
+                            //can't currently associate link to text.
+                            //for now, extract link and repeat the link as if it
+                            //were the visible text
                             PDActionURI uri = (PDActionURI) action;
                             String link = uri.getURI();
-                            if (link != null) {
+                            if (link != null && link.trim().length() > 0) {
                                 xhtml.startElement("div", "class", "annotation");
                                 xhtml.startElement("a", "href", link);
+                                xhtml.characters(link);
                                 xhtml.endElement("a");
                                 xhtml.endElement("div");
                             }

http://git-wip-us.apache.org/repos/asf/tika/blob/95b2cd12/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index be1f769..94b1548 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -387,7 +387,8 @@ public class PDFParserTest extends TikaTest {
     @Test
     public void testLinks() throws Exception {
         final XMLResult result = getXML("testPDFVarious.pdf");
-        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
+        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\">"+
+                "http://tika.apache.org/</a></div>", result.xml);
     }
 
     @Test
