Author: nick Date: Fri Apr 27 13:59:49 2012 New Revision: 1331434 URL: http://svn.apache.org/viewvc?rev=1331434&view=rev Log: TIKA-861 Patch from Ryan Quam to enable extracting PDF Links. (Links are extracted for now at the end of the page; further work will be needed to match them to the text they apply to)
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1331434&r1=1331433&r2=1331434&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri Apr 27 13:59:49 2012 @@ -22,7 +22,10 @@ import java.io.Writer; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.util.PDFTextStripper; +import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction; +import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; import org.apache.pdfbox.util.TextPosition; import org.apache.tika.exception.TikaException; @@ -140,6 +143,23 @@ class PDF2XHTML extends PDFTextStripper // TODO: remove once PDFBOX-1143 is fixed: if (extractAnnotationText) { for(Object o : page.getAnnotations()) { + if( o instanceof PDAnnotationLink ) { + PDAnnotationLink annotationlink = (PDAnnotationLink) o; + if (annotationlink.getAction() != null) { + PDAction action = annotationlink.getAction(); + if( action instanceof PDActionURI ) { + PDActionURI uri = (PDActionURI) action; + String link = uri.getURI(); + if (link != null) { + handler.startElement("div", "class", "annotation"); + handler.startElement("a", "href", link); + handler.endElement("a"); + handler.endElement("div"); + } + } + } + } + 
if ((o instanceof PDAnnotation) && PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(((PDAnnotation) o).getSubtype())) { // It's a text annotation: PDAnnotationMarkup annot = (PDAnnotationMarkup) o; Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1331434&r1=1331433&r2=1331434&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Apr 27 13:59:49 2012 @@ -308,6 +308,19 @@ public class PDFParserTest extends TikaT assertContains("<p>1</p>", content); } + /** + * Test to ensure that Links are extracted from the text + * + * Note - the PDF contains the text "This is a hyperlink" which has + * a hyperlink annotation, linking to the tika site, on it. This + * test will need updating when we're able to apply the annotation + * to the text itself, rather than following on afterwards as now + */ + public void testLinks() throws Exception { + final XMLResult result = getXML("testPDFVarious.pdf"); + assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\"/></div>", result.xml); + } + public void testDisableAutoSpace() throws Exception { PDFParser parser = new PDFParser(); parser.setEnableAutoSpace(false);