TIKA-1508 proof of concept with one parameter on PDFParser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/853750d4 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/853750d4 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/853750d4 Branch: refs/heads/master Commit: 853750d47fa99afad0df6d4f4727a35c96675254 Parents: 18ab8f9 Author: tballison <[email protected]> Authored: Fri Jun 3 16:32:48 2016 -0400 Committer: tballison <[email protected]> Committed: Fri Jun 3 16:32:48 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 1 - .../org/apache/tika/parser/pdf/PDFParser.java | 17 ++++++++---- .../apache/tika/parser/pdf/PDFParserTest.java | 16 ++++++++++++ .../org/apache/tika/parser/pdf/tika-config.xml | 27 ++++++++++++++++++++ 4 files changed, 55 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/853750d4/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index ac9823e..34a3aff 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -106,7 +106,6 @@ class PDF2XHTML extends AbstractPDF2XHTML { // key methods to output to the given content // handler. 
pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config); - config.configure(pdf2XHTML); pdf2XHTML.writeText(document, new Writer() { http://git-wip-us.apache.org/repos/asf/tika/blob/853750d4/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 3e33962..bacc901 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -43,6 +43,7 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.common.PDMetadata; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; +import org.apache.tika.config.Field; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -55,6 +56,7 @@ import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ConfigurableParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.image.xmp.JempboxExtractor; @@ -64,6 +66,7 @@ import org.w3c.dom.Document; import org.xml.sax.ContentHandler; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXException; +import static org.bouncycastle.asn1.x500.style.RFC4519Style.name; /** * PDF parser. @@ -83,7 +86,7 @@ import org.xml.sax.SAXException; * turn this feature on, see * {@link PDFParserConfig#setExtractInlineImages(boolean)}. 
*/ -public class PDFParser extends AbstractParser { +public class PDFParser extends AbstractParser implements ConfigurableParser { /** @@ -102,12 +105,13 @@ public class PDFParser extends AbstractParser { Collections.singleton(MEDIA_TYPE); private PDFParserConfig defaultConfig = new PDFParserConfig(); - - public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } + @Field + private boolean sortByPosition = false; + public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) @@ -117,6 +121,8 @@ public class PDFParser extends AbstractParser { TemporaryResources tmp = new TemporaryResources(); //config from context, or default if not set via context PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); + //TODO: get rid of this after dev of TIKA-1508!!! + localConfig.setSortByPosition(sortByPosition); String password = ""; try { // PDFBox can process entirely in memory, or can use a temp file @@ -582,7 +588,7 @@ public class PDFParser extends AbstractParser { * @deprecated use {@link #getPDFParserConfig()} */ public boolean getSortByPosition() { - return defaultConfig.getSortByPosition(); + return sortByPosition; } /** @@ -595,8 +601,9 @@ public class PDFParser extends AbstractParser { * * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} */ + @Field public void setSortByPosition(boolean v) { - defaultConfig.setSortByPosition(v); + sortByPosition = v; } http://git-wip-us.apache.org/repos/asf/tika/blob/853750d4/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index df2e27c..ac54b11 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -16,9 +16,11 @@ */ package org.apache.tika.parser.pdf; +import static org.bouncycastle.crypto.tls.CipherType.stream; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -33,6 +35,7 @@ import org.apache.commons.io.IOUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -1212,6 +1215,19 @@ public class PDFParserTest extends TikaTest { } + @Test + public void testInitializationViaConfig() throws Exception { + InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-config.xml"); + assertNotNull(is); + TikaConfig tikaConfig = new TikaConfig(is); + Parser p = new AutoDetectParser(tikaConfig); + String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p); + text = text.replaceAll("\\s+", " "); + + // Column text is now interleaved: + assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text); + + } private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; http://git-wip-us.apache.org/repos/asf/tika/blob/853750d4/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml new file mode 100644 index 0000000..0b965c7 
--- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="sortByPosition" type="bool">true</param> + </params> + </parser> + + </parsers> +</properties>
