Repository: tika Updated Branches: refs/heads/master da8363fe6 -> 7555b136d
TIKA-2259 -- improve url extraction from PDFs = copy Tilman Hausherr's code from PDFBOX-3644 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7555b136 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7555b136 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7555b136 Branch: refs/heads/master Commit: 7555b136d9ba046e2007d1f305f707948fcbcbc3 Parents: da8363f Author: tballison <[email protected]> Authored: Thu Feb 2 14:32:29 2017 -0500 Committer: tballison <[email protected]> Committed: Thu Feb 2 14:32:29 2017 -0500 ---------------------------------------------------------------------- .../tika/parser/pdf/AbstractPDF2XHTML.java | 52 +++++++++++++------- 1 file changed, 33 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/7555b136/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index ead75db..944ae9c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -25,6 +25,8 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -62,7 +64,6 @@ import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget; import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; @@ -368,24 +369,15 @@ class AbstractPDF2XHTML extends PDFTextStripper { } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { - if (annotation instanceof PDAnnotationLink) { - PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; - if (annotationlink.getAction() != null) { - PDAction action = annotationlink.getAction(); - if (action instanceof PDActionURI) { - //can't currently associate link to text. - //for now, extract link and repeat the link as if it - //were the visible text - PDActionURI uri = (PDActionURI) action; - String link = uri.getURI(); - if (link != null && link.trim().length() > 0) { - xhtml.startElement("div", "class", "annotation"); - xhtml.startElement("a", "href", link); - xhtml.characters(link); - xhtml.endElement("a"); - xhtml.endElement("div"); - } - } + PDActionURI uri = getActionURI(annotation); + if (uri != null) { + String link = uri.getURI(); + if (link != null && link.trim().length() > 0) { + xhtml.startElement("div", "class", "annotation"); + xhtml.startElement("a", "href", link); + xhtml.characters(link); + xhtml.endElement("a"); + xhtml.endElement("div"); } } @@ -762,4 +754,26 @@ class AbstractPDF2XHTML extends PDFTextStripper { xhtml.endElement("li"); } } + + + private static PDActionURI getActionURI(PDAnnotation annot) { + //copied and pasted from PDFBox's PrintURLs + + // use reflection to catch all annotation types that have getAction() + // If you can't use reflection, then check for classes + // PDAnnotationLink and PDAnnotationWidget, and call getAction() and check for a + // PDActionURI result type + try { + Method actionMethod = annot.getClass().getDeclaredMethod("getAction"); + if (actionMethod.getReturnType().equals(PDAction.class)) { + PDAction action = (PDAction) actionMethod.invoke(annot); + if (action instanceof PDActionURI) { + return (PDActionURI) action; + } + } + } + catch (NoSuchMethodException|IllegalAccessException|InvocationTargetException e) { + } + return null; + } }
