This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4567 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3b1cd2eddfefc415f569a13ae09ceea7a3850eca Author: tallison <[email protected]> AuthorDate: Thu Dec 11 11:25:45 2025 -0500 TIKA-4567 -- further tweaks --- .../resources/tika-config-default-single-file.json | 5 +- .../org/apache/tika/config/ConfigDeserializer.java | 3 + .../src/test/resources/kafka/tika-config-kafka.xml | 72 --------- .../src/test/resources/tika-config-kafka.xml | 123 --------------- .../opensearch/tika-config-opensearch.json | 5 +- .../src/test/resources/s3/tika-config-s3.json | 5 +- .../src/test/resources/s3/tika-config-s3.xml | 68 -------- .../resources/tika-config-s3-integration-test.xml | 121 -------------- .../src/test/resources/tika-config-s3ToFs.xml | 37 ----- .../src/test/resources/tika-config-s3Tos3.xml | 47 ------ .../src/test/resources/solr/tika-config-solr.xml | 70 --------- .../src/test/resources/tika-config-solr-urls.json | 5 +- .../src/test/resources/tika-config-solr-urls.xml | 120 -------------- .../tika-parser-pdf-module/pom.xml | 6 + .../org/apache/tika/parser/pdf/AccessChecker.java | 129 --------------- .../java/org/apache/tika/parser/pdf/OcrConfig.java | 10 +- .../java/org/apache/tika/parser/pdf/PDFParser.java | 31 +++- .../apache/tika/parser/pdf/PDFParserConfig.java | 57 ++++++- .../pdf/image/ImageGraphicsEngineFactory.java | 19 --- .../apache/tika/parser/pdf/AccessCheckerTest.java | 138 ---------------- .../pdf/MyCustomImageGraphicsEngineFactory.java | 53 ++++++- .../org/apache/tika/parser/pdf/PDFParserTest.java | 173 ++++++++++----------- .../pdf/tika-config-custom-graphics-engine.json | 11 ++ .../pdf/tika-config-custom-graphics-engine.xml | 28 ---- .../tika/parser/pdf/tika-config-non-primitives.xml | 29 ---- .../org/apache/tika/parser/pdf/tika-config.json | 9 ++ .../org/apache/tika/parser/pdf/tika-config.xml | 26 ---- .../apache/tika/parser/pdf/tika-inline-config.json | 19 +++ .../apache/tika/parser/pdf/tika-inline-config.xml | 38 ----- .../org/apache/tika/parser/pdf/tika-ocr-config.xml | 36 ----- .../tika/parser/pdf/tika-rendering-config.xml | 34 ---- .../parser/pdf/tika-rendering-per-page-config.xml | 32 ---- .../tika/parser/pdf/tika-xml-profiler-config.xml | 24 --- .../serialization/ParseContextDeserializer.java | 2 +- 34 files changed, 274 insertions(+), 1311 deletions(-) diff --git a/tika-app/src/main/resources/tika-config-default-single-file.json b/tika-app/src/main/resources/tika-config-default-single-file.json index 696a8f6414..e9af227964 100644 --- a/tika-app/src/main/resources/tika-config-default-single-file.json +++ b/tika-app/src/main/resources/tika-config-default-single-file.json @@ -7,10 +7,7 @@ "pdf-parser": { "extractActions": true, "extractInlineImages": true, - "accessChecker": { - "needToCheck": true, - "allowExtractionForAccessibility": true - }, + "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY", "extractIncrementalUpdateInfo": true, "parseIncrementalUpdates":true diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java b/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java index e64b9ef82c..68e88cd40f 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java +++ b/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java @@ -62,6 +62,9 @@ public class ConfigDeserializer { Method method = null; try { clazz = Class.forName("com.fasterxml.jackson.databind.ObjectMapper"); + // Use a plain ObjectMapper for simple config deserialization. + // The polymorphic mapper from tika-serialization is meant for ParseContext + // serialization with actual polymorphic types, not for simple config classes. instance = clazz.getDeclaredConstructor().newInstance(); method = clazz.getMethod("readValue", String.class, Class.class); } catch (Exception e) { diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/tika-config-kafka.xml b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/tika-config-kafka.xml deleted file mode 100644 index fb29c9ad6a..0000000000 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/tika-config-kafka.xml +++ /dev/null @@ -1,72 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractActions" type="bool">true</param> - <param name="accessChecker" type="org.apache.tika.parser.pdf.AccessChecker"> - <params> - <param name="needToCheck" type="bool">true</param> - <param name="allowExtractionForAccessibility" type="bool">true</param> - </params> - </param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> - <params> - <param name="includeDeletedContent" type="bool">true</param> - <param name="includeMoveFromContent" type="bool">true</param> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.OfficeParser"> - <params> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - </parsers> - <metadataFilters> - <!-- depending on the file format, some dates do not have a timezone. This - filter arbitrarily assumes dates have a UTC timezone and will format all - dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone. - --> - <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/> - <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter"> - <params> - <excludeUnmapped>true</excludeUnmapped> - <mappings> - <mapping from="X-TIKA:content" to="content_s"/> - <mapping from="Content-Length" to="length_i"/> - <mapping from="dc:creator" to="creators_ss"/> - <mapping from="dc:title" to="title_s"/> - <mapping from="Content-Type" to="mime_s"/> - <mapping from="X-TIKA:EXCEPTION:container_exception" to="tika_exception_s"/> - </mappings> - </params> - </metadataFilter> - </metadataFilters> -</properties> diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml deleted file mode 100644 index 820e1bc7b5..0000000000 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml +++ /dev/null @@ -1,123 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractActions" type="bool">true</param> - <param name="accessChecker" type="org.apache.tika.parser.pdf.AccessChecker"> - <params> - <param name="needToCheck" type="bool">true</param> - <param name="allowExtractionForAccessibility" type="bool">true</param> - </params> - </param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> - <params> - <param name="includeDeletedContent" type="bool">true</param> - <param name="includeMoveFromContent" type="bool">true</param> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.OfficeParser"> - <params> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - </parsers> - <metadataFilters> - <!-- depending on the file format, some dates do not have a timezone. This - filter arbitrarily assumes dates have a UTC timezone and will format all - dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone. - --> - <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/> - <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter"> - <params> - <excludeUnmapped>true</excludeUnmapped> - <mappings> - <mapping from="X-TIKA:content" to="content_s"/> - <mapping from="Content-Length" to="length_i"/> - <mapping from="dc:creator" to="creators_ss"/> - <mapping from="dc:title" to="title_s"/> - <mapping from="Content-Type" to="mime_s"/> - <mapping from="X-TIKA:EXCEPTION:container_exception" to="tika_exception_s"/> - </mappings> - </params> - </metadataFilter> - </metadataFilters> - <async> - <params> - <directEmitThresholdBytes>10000</directEmitThresholdBytes> - <emitMaxEstimatedBytes>100000</emitMaxEstimatedBytes> - <emitWithinMillis>10</emitWithinMillis> - <numEmitters>1</numEmitters> - <numClients>1</numClients> - <tikaConfig>{TIKA_CONFIG}</tikaConfig> - <forkedJvmArgs> - <arg>-Xmx1g</arg> - <arg>-XX:ParallelGCThreads=2</arg> - <arg>-XX:+ExitOnOutOfMemoryError</arg> - <arg>-Dlog4j.configurationFile={LOG4J_PROPERTIES_FILE}</arg> - </forkedJvmArgs> - <timeoutMillis>60000</timeoutMillis> - </params> - </async> - <fetchers> - <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher"> - <params> - <name>fsf</name> - <basePath>{PATH_TO_DOCS}</basePath> - </params> - </fetcher> - </fetchers> - <emitters> - <emitter class="org.apache.tika.pipes.emitter.kafka.KafkaEmitter"> - <params> - <name>ke</name> - <topic>{EMITTER_TOPIC}</topic> - <bootstrapServers>{BOOTSTRAP_SERVERS}</bootstrapServers> - </params> - </emitter> - <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter"> - <params> - <name>fse</name> - <basePath>/path/to/extracts</basePath> - </params> - </emitter> - </emitters> - <pipesIterator class="org.apache.tika.pipes.iterator.kafka.KafkaPipesIterator"> - <params> - <topic>{PIPE_ITERATOR_TOPIC}</topic> - <bootstrapServers>{BOOTSTRAP_SERVERS}</bootstrapServers> - <groupId>grpid</groupId> - <autoOffsetReset>earliest</autoOffsetReset> - <pollDelayMs>1000</pollDelayMs> - <fetcherName>fsf</fetcherName> - <emitterName>ke</emitterName> - </params> - </pipesIterator> -</properties> \ No newline at end of file diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json index 2381bca4ee..4a08cfa27d 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json @@ -6,10 +6,7 @@ { "pdf-parser": { "extractActions": true, - "accessChecker": { - "needToCheck": true, - "allowExtractionForAccessibility": true - } + "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY" } }, { diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json index 017a047b95..043da2349f 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json @@ -15,10 +15,7 @@ { "pdf-parser": { "extractActions": true, - "accessChecker": { - "needToCheck": true, - "allowExtractionForAccessibility": true - } + "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY" } }, { diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.xml b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.xml deleted file mode 100644 index c151bf907f..0000000000 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.xml +++ /dev/null @@ -1,68 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractActions" type="bool">true</param> - <param name="accessChecker" type="org.apache.tika.parser.pdf.AccessChecker"> - <params> - <param name="needToCheck" type="bool">true</param> - <param name="allowExtractionForAccessibility" type="bool">true</param> - </params> - </param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> - <params> - <param name="includeDeletedContent" type="bool">true</param> - <param name="includeMoveFromContent" type="bool">true</param> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.OfficeParser"> - <params> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - </parsers> - <metadataFilters> - <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/> - <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter"> - <params> - <excludeUnmapped>true</excludeUnmapped> - <mappings> - <mapping from="X-TIKA:content" to="content_s"/> - <mapping from="Content-Length" to="length_i"/> - <mapping from="dc:creator" to="creators_ss"/> - <mapping from="dc:title" to="title_s"/> - <mapping from="Content-Type" to="mime_s"/> - <mapping from="X-TIKA:EXCEPTION:container_exception" to="tika_exception_s"/> - </mappings> - </params> - </metadataFilter> - </metadataFilters> -</properties> diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml deleted file mode 100644 index 7b361483c8..0000000000 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml +++ /dev/null @@ -1,121 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractActions" type="bool">true</param> - <param name="accessChecker" type="org.apache.tika.parser.pdf.AccessChecker"> - <params> - <param name="needToCheck" type="bool">true</param> - <param name="allowExtractionForAccessibility" type="bool">true</param> - </params> - </param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> - <params> - <param name="includeDeletedContent" type="bool">true</param> - <param name="includeMoveFromContent" type="bool">true</param> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.OfficeParser"> - <params> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - </parsers> - <metadataFilters> - <!-- depending on the file format, some dates do not have a timezone. This - filter arbitrarily assumes dates have a UTC timezone and will format all - dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone. - --> - <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/> - <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter"> - <excludeUnmapped>true</excludeUnmapped> - <mappings> - <mapping from="X-TIKA:content" to="content_s"/> - <mapping from="Content-Length" to="length_i"/> - <mapping from="dc:creator" to="creators_ss"/> - <mapping from="dc:title" to="title_s"/> - <mapping from="Content-Type" to="mime_s"/> - <mapping from="X-TIKA:EXCEPTION:container_exception" to="tika_exception_s"/> - </mappings> - </metadataFilter> - </metadataFilters> - <async> - <directEmitThresholdBytes>10000</directEmitThresholdBytes> - <emitMaxEstimatedBytes>100000</emitMaxEstimatedBytes> - <emitWithinMillis>10</emitWithinMillis> - <numEmitters>1</numEmitters> - <numClients>1</numClients> - <tikaConfig>{TIKA_CONFIG}</tikaConfig> - <forkedJvmArgs> - <arg>-Xmx1g</arg> - <arg>-XX:ParallelGCThreads=2</arg> - <arg>-XX:+ExitOnOutOfMemoryError</arg> - <arg>-Dlog4j.configurationFile={LOG4J_PROPERTIES_FILE}</arg> - </forkedJvmArgs> - <timeoutMillis>60000</timeoutMillis> - </async> - <fetchers> - <fetcher class="org.apache.tika.pipes.fetcher.s3.S3Fetcher"> - <name>s3f</name> - <region>{REGION}</region> - <bucket>{FETCH_BUCKET}</bucket> - <credentialsProvider>key_secret</credentialsProvider> - <accessKey>{ACCESS_KEY}</accessKey> - <secretKey>{SECRET_KEY}</secretKey> - <endpointConfigurationService>{ENDPOINT_CONFIGURATION_SERVICE}</endpointConfigurationService> - <pathStyleAccessEnabled>true</pathStyleAccessEnabled> - <throttleSeconds>30,120,600,1200</throttleSeconds> - </fetcher> - </fetchers> - <pipesIterator class="org.apache.tika.pipes.iterator.S3PipesIterator"> - <emitterName>s3e</emitterName> - <fetcherName>s3f</fetcherName> - <region>{REGION}</region> - <bucket>{PIPE_ITERATOR_BUCKET}</bucket> - <credentialsProvider>key_secret</credentialsProvider> - <accessKey>{ACCESS_KEY}</accessKey> - <secretKey>{SECRET_KEY}</secretKey> - <endpointConfigurationService>{ENDPOINT_CONFIGURATION_SERVICE}</endpointConfigurationService> - <pathStyleAccessEnabled>true</pathStyleAccessEnabled> - </pipesIterator> - <emitters> - <emitter class="org.apache.tika.pipes.emitter.s3.S3Emitter"> - <name>s3e</name> - <region>{REGION}</region> - <bucket>{EMIT_BUCKET}</bucket> - <credentialsProvider>key_secret</credentialsProvider> - <accessKey>{ACCESS_KEY}</accessKey> - <secretKey>{SECRET_KEY}</secretKey> - <endpointConfigurationService>{ENDPOINT_CONFIGURATION_SERVICE}</endpointConfigurationService> - <pathStyleAccessEnabled>true</pathStyleAccessEnabled> - </emitter> - </emitters> -</properties> \ No newline at end of file diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml deleted file mode 100644 index b02906f5d4..0000000000 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml +++ /dev/null @@ -1,37 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <fetchers> - <fetcher class="org.apache.tika.pipes.fetcher.s3.S3Fetcher"> - <name>s3f</name> - <region>us-east-1</region> - <profile>default</profile> - <bucket><!-- fill in here --></bucket> - <credentialsProvider>profile</credentialsProvider> - </fetcher> - </fetchers> - <pipesIterator class="org.apache.tika.pipes.iterator.S3PipesIterator"> - <fetcherName>s3f</fetcherName> - <bucket><!-- fill in here --></bucket> - <region>us-east-1</region> - <profile>default</profile> - <credentialsProvider>profile</credentialsProvider> - </pipesIterator> -</properties> \ No newline at end of file diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml deleted file mode 100644 index fc30a56634..0000000000 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml +++ /dev/null @@ -1,47 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <fetchers> - <fetcher class="org.apache.tika.pipes.fetcher.s3.S3Fetcher"> - <name>s3f</name> - <region>us-east-1</region> - <bucket><!-- fill in here --></bucket> - <profile>default</profile> - <credentialsProvider>profile</credentialsProvider> - </fetcher> - </fetchers> - <pipesIterator class="org.apache.tika.pipes.iterator.S3PipesIterator"> - <fetcherName>s3f</fetcherName> - <region>us-east-1</region> - <bucket><!-- fill in here --></bucket> - <profile>default</profile> - <credentialsProvider>profile</credentialsProvider> - </pipesIterator> - <emitters> - <emitter class="org.apache.tika.pipes.emitter.s3.S3Emitter"> - <name>s3e</name> - <region>us-east-1</region> - <bucket><!-- fill in here --></bucket> - <profile>default</profile> - <fileExtension></fileExtension> - <credentialsProvider>profile</credentialsProvider> - </emitter> - </emitters> -</properties> \ No newline at end of file diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/tika-config-solr.xml b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/tika-config-solr.xml deleted file mode 100644 index 53ad77b463..0000000000 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/tika-config-solr.xml +++ /dev/null @@ -1,70 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractActions" type="bool">true</param> - <param name="accessChecker" type="org.apache.tika.parser.pdf.AccessChecker"> - <params> - <param name="needToCheck" type="bool">true</param> - <param name="allowExtractionForAccessibility" type="bool">true</param> - </params> - </param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> - <params> - <param name="includeDeletedContent" type="bool">true</param> - <param name="includeMoveFromContent" type="bool">true</param> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.OfficeParser"> - <params> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - </parsers> - <metadataFilters> - <!-- depending on the file format, some dates do not have a timezone. This - filter arbitrarily assumes dates have a UTC timezone and will format all - dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone. - --> - <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/> - <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter"> - <excludeUnmapped>true</excludeUnmapped> - <mappings> - <mapping from="X-TIKA:content" to="content_s"/> - <mapping from="Content-Length" to="length_i"/> - <mapping from="dc:creator" to="creators_ss"/> - <mapping from="dc:title" to="title_s"/> - <mapping from="Content-Type" to="mime_s"/> - <mapping from="X-TIKA:EXCEPTION:container_exception" to="tika_exception_s"/> - </mappings> - </metadataFilter> - </metadataFilters> -</properties> diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json index ea91e6c806..375cd94cc6 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json @@ -6,10 +6,7 @@ { "pdf-parser": { "extractActions": true, - "accessChecker": { - "needToCheck": true, - "allowExtractionForAccessibility": true - } + "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY" } }, { diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml deleted file mode 100644 index ba17c705cd..0000000000 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml +++ /dev/null @@ -1,120 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractActions" type="bool">true</param> - <param name="accessChecker" type="org.apache.tika.parser.pdf.AccessChecker"> - <params> - <param name="needToCheck" type="bool">true</param> - <param name="allowExtractionForAccessibility" type="bool">true</param> - </params> - </param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> - <params> - <param name="includeDeletedContent" type="bool">true</param> - <param name="includeMoveFromContent" type="bool">true</param> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.OfficeParser"> - <params> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - </parsers> - <metadataFilters> - <!-- depending on the file format, some dates do not have a timezone. This - filter arbitrarily assumes dates have a UTC timezone and will format all - dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone. - --> - <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/> - <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter"> - <excludeUnmapped>true</excludeUnmapped> - <mappings> - <mapping from="X-TIKA:content" to="content_s"/> - <mapping from="Content-Length" to="length_i"/> - <mapping from="dc:creator" to="creators_ss"/> - <mapping from="dc:title" to="title_s"/> - <mapping from="Content-Type" to="mime_s"/> - <mapping from="X-TIKA:EXCEPTION:container_exception" to="tika_exception_s"/> - </mappings> - </metadataFilter> - </metadataFilters> - <async> - <directEmitThresholdBytes>10000</directEmitThresholdBytes> - <emitMaxEstimatedBytes>100000</emitMaxEstimatedBytes> - <emitWithinMillis>10</emitWithinMillis> - <numEmitters>1</numEmitters> - <numClients>1</numClients> - <tikaConfig>{TIKA_CONFIG}</tikaConfig> - <forkedJvmArgs> - <arg>-Xmx1g</arg> - <arg>-XX:ParallelGCThreads=2</arg> - <arg>-XX:+ExitOnOutOfMemoryError</arg> - <arg>-Dlog4j.configurationFile={LOG4J_PROPERTIES_FILE}</arg> - </forkedJvmArgs> - <timeoutMillis>60000</timeoutMillis> - </async> - <fetchers> - <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher"> - <name>fsf</name> - <basePath>{PATH_TO_DOCS}</basePath> - </fetcher> - </fetchers> - <emitters> - <emitter class="org.apache.tika.pipes.emitter.solr.SolrEmitter"> - <name>se</name> - {SOLR_CONNECTION} - <updateStrategy>{UPDATE_STRATEGY}</updateStrategy> - <solrCollection>testcol</solrCollection> - <attachmentStrategy>{ATTACHMENT_STRATEGY}</attachmentStrategy> - <commitWithin>1</commitWithin> - <idField>id</idField> - <connectionTimeout>10000</connectionTimeout> - <socketTimeout>60000</socketTimeout> - </emitter> - <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter"> - <name>fse</name> - <basePath>/path/to/extracts</basePath> - </emitter> - </emitters> - <pipesIterator class="org.apache.tika.pipes.emitter.solr.SolrPipesIterator"> - <solrCollection>testcol</solrCollection> - {SOLR_CONNECTION} - <idField>id</idField> - <parsingIdField>parsing_id_i</parsingIdField> - <failCountField>fail_count_i</failCountField> - <sizeFieldName>size_i</sizeFieldName> - <parseMode>{PARSE_MODE}</parseMode> - <rows>100</rows> - <fetcherName>fsf</fetcherName> - <emitterName>se</emitterName> - </pipesIterator> -</properties> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml index 8ceb9db51c..1fcf2e76e4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml @@ -87,6 +87,12 @@ <artifactId>jai-imageio-core</artifactId> <scope>test</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-serialization</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> </dependencies> <build> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java deleted file mode 100644 index 6c294ee5ca..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.pdf; - -import java.io.Serializable; - -import org.apache.tika.exception.AccessPermissionException; -import org.apache.tika.metadata.AccessPermissions; -import org.apache.tika.metadata.Metadata; - -/** - * Checks whether or not a document allows extraction generally - * or extraction for accessibility only. - */ -public class AccessChecker implements Serializable { - - private static final long serialVersionUID = 6492570218190936987L; - - /** - * Mode for checking document access permissions. - */ - public enum AccessCheckMode { - /** - * Don't check extraction permissions. Content will always be extracted - * regardless of document permissions. This is the default for backwards - * compatibility with Tika's legacy behavior (<= v1.7). - */ - DONT_CHECK, - - /** - * Check permissions, but allow extraction for accessibility purposes. - * If general extraction is blocked but accessibility extraction is allowed, - * content will be extracted. - */ - ALLOW_EXTRACTION_FOR_ACCESSIBILITY, - - /** - * Enforce document permissions strictly. If extraction is blocked, - * an {@link AccessPermissionException} will be thrown. - */ - ENFORCE_PERMISSIONS - } - - private AccessCheckMode mode; - - /** - * Constructs an {@link AccessChecker} with {@link AccessCheckMode#DONT_CHECK}. - * This will not perform any checking and will always return without - * throwing an exception. - * <p/> - * This constructor is available to allow for Tika's legacy (<= v1.7) behavior. - */ - public AccessChecker() { - this.mode = AccessCheckMode.DONT_CHECK; - } - - /** - * Constructs an {@link AccessChecker} with the specified mode. - * - * @param mode the access check mode - */ - public AccessChecker(AccessCheckMode mode) { - this.mode = mode; - } - - public AccessCheckMode getMode() { - return mode; - } - - public void setMode(AccessCheckMode mode) { - this.mode = mode; - } - - /** - * Checks to see if a document's content should be extracted based - * on metadata values and the configured {@link AccessCheckMode}. - * - * @param metadata the document metadata containing access permissions - * @throws AccessPermissionException if access is not permitted - */ - public void check(Metadata metadata) throws AccessPermissionException { - if (mode == AccessCheckMode.DONT_CHECK) { - return; - } - - if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { - if (mode == AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY) { - if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { - return; - } - throw new AccessPermissionException( - "Content extraction for accessibility is not allowed."); - } - throw new AccessPermissionException("Content extraction is not allowed."); - } - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - AccessChecker checker = (AccessChecker) o; - return mode == checker.mode; - } - - @Override - public int hashCode() { - return mode != null ? mode.hashCode() : 0; - } -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java index 9101b2f6ab..97bd59e5a6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java @@ -19,8 +19,6 @@ package org.apache.tika.parser.pdf; import java.io.Serializable; import java.util.Locale; -import org.apache.pdfbox.rendering.ImageType; - /** * Configuration for OCR processing in PDF parsing. * Groups all OCR-related settings together. @@ -180,6 +178,14 @@ public class OcrConfig implements Serializable { return imageFormat.getFormatName(); } + /** + * No-op setter for Jackson deserialization compatibility. + * The format name is derived from {@link #setImageFormat(ImageFormat)}. + */ + public void setImageFormatName(String imageFormatName) { + // Ignored - use setImageFormat instead + } + public float getImageQuality() { return imageQuality; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 99b2446dca..0dc2727ca0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -66,6 +66,7 @@ import org.apache.tika.config.JsonConfig; import org.apache.tika.config.Param; import org.apache.tika.config.ParseContextConfig; import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -225,8 +226,7 @@ public class PDFParser implements Parser, RenderingParser, Initializable { extractMetadata(pdfDocument, metadata, context); extractSignatures(pdfDocument, metadata); checkIllustrator(pdfDocument, metadata); - AccessChecker checker = localConfig.getAccessChecker(); - checker.check(metadata); + checkAccessPermissions(localConfig.getAccessCheckMode(), metadata); renderPagesBeforeParse(tstream, handler, metadata, context, localConfig); if (handler != null) { if (shouldHandleXFAOnly(hasXFA, localConfig)) { @@ -399,6 +399,25 @@ public class PDFParser implements Parser, RenderingParser, Initializable { //COSStream aiMetaData = privateDict.getCOSStream(COSName.AI_META_DATA); } + private void checkAccessPermissions(PDFParserConfig.AccessCheckMode mode, Metadata metadata) + throws AccessPermissionException { + if (mode == PDFParserConfig.AccessCheckMode.DONT_CHECK) { + return; + } + + if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { + if (mode == PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY) { + if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { + return; + } + throw new AccessPermissionException( + "Content extraction for accessibility is not allowed."); + } + // IGNORE_ACCESSIBILITY_ALLOWANCE - don't extract even if accessibility is allowed + throw new AccessPermissionException("Content extraction is not allowed."); + } + } + private void extractSignatures(PDDocument pdfDocument, Metadata metadata) { boolean hasSignature = false; for (PDSignature signature : pdfDocument.getSignatureDictionaries()) { @@ -980,12 +999,12 @@ public class PDFParser implements Parser, RenderingParser, Initializable { return defaultConfig.isIfXFAExtractOnlyXFA(); } @Field - public void setAccessCheckMode(AccessChecker.AccessCheckMode mode) { - defaultConfig.getAccessChecker().setMode(mode); + public void setAccessCheckMode(PDFParserConfig.AccessCheckMode mode) { + defaultConfig.setAccessCheckMode(mode); } - public AccessChecker.AccessCheckMode getAccessCheckMode() { - return defaultConfig.getAccessChecker().getMode(); + public PDFParserConfig.AccessCheckMode getAccessCheckMode() { + return defaultConfig.getAccessCheckMode(); } @Field diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 6d238d2c33..584b59c513 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -36,6 +36,30 @@ public class PDFParserConfig implements Serializable { private static final long serialVersionUID = 6492570218190936986L; + /** + * Mode for checking document access permissions. + */ + public enum AccessCheckMode { + /** + * Don't check extraction permissions. Content will always be extracted + * regardless of document permissions. This is the default for backwards + * compatibility with Tika's legacy behavior (<= v1.7). + */ + DONT_CHECK, + + /** + * Check permissions, but allow extraction for accessibility purposes if + * extraction for accessibility is allowed. + */ + ALLOW_EXTRACTION_FOR_ACCESSIBILITY, + + /** + * If extraction is blocked, throw an {@link org.apache.tika.exception.AccessPermissionException} + * even if the document allows extraction for accessibility. + */ + IGNORE_ACCESSIBILITY_ALLOWANCE + } + // True if we let PDFBox "guess" where spaces should go: private boolean enableAutoSpace = true; @@ -99,7 +123,7 @@ public class PDFParserConfig implements Serializable { * Should the entire document be rendered? */ private IMAGE_STRATEGY imageStrategy = IMAGE_STRATEGY.NONE; - private AccessChecker accessChecker = new AccessChecker(); + private AccessCheckMode accessCheckMode = AccessCheckMode.DONT_CHECK; //The PDFParser can throw IOExceptions if there is a problem //with a streams. If this is set to true, Tika's @@ -452,12 +476,12 @@ public class PDFParserConfig implements Serializable { this.dropThreshold = dropThreshold; } - public AccessChecker getAccessChecker() { - return accessChecker; + public AccessCheckMode getAccessCheckMode() { + return accessCheckMode; } - public void setAccessChecker(AccessChecker accessChecker) { - this.accessChecker = accessChecker; + public void setAccessCheckMode(AccessCheckMode accessCheckMode) { + this.accessCheckMode = accessCheckMode; } /** @@ -543,6 +567,14 @@ public class PDFParserConfig implements Serializable { return ocr.getImageFormatName(); } + /** + * No-op setter for Jackson deserialization compatibility. + * Use {@link #setOcrImageFormat(OcrConfig.ImageFormat)} instead. + */ + public void setOcrImageFormatName(String ocrImageFormatName) { + // Ignored - use setOcrImageFormat instead + } + public OcrConfig.ImageFormat getOcrImageFormat() { return ocr.getImageFormat(); } @@ -667,6 +699,21 @@ public class PDFParserConfig implements Serializable { this.imageGraphicsEngineFactory = imageGraphicsEngineFactory; } + /** + * EXPERT: Customize the class that handles inline images within a PDF page. + * Use this setter when specifying the factory class name in JSON config. + * + * @param className fully qualified class name of an ImageGraphicsEngineFactory implementation + */ + public void setImageGraphicsEngineFactoryClass(String className) { + try { + Class<?> clazz = Class.forName(className); + this.imageGraphicsEngineFactory = (ImageGraphicsEngineFactory) clazz.getDeclaredConstructor().newInstance(); + } catch (Exception e) { + throw new RuntimeException("Failed to instantiate ImageGraphicsEngineFactory: " + className, e); + } + } + public ImageGraphicsEngineFactory getImageGraphicsEngineFactory() { return imageGraphicsEngineFactory; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java index f0cdd0811e..890f7beeef 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java @@ -42,23 +42,4 @@ public class ImageGraphicsEngineFactory implements Serializable { processedInlineImages, imageCounter, xhtml, parentMetadata, parseContext); } - /** - * Returns the factory type for serialization purposes. - * This allows Jackson to serialize the factory object without requiring additional dependencies. - * - * @return the fully qualified class name of this factory - */ - public String getFactoryType() { - return getClass().getName(); - } - - /** - * Setter for factory type to complete the JavaBean pattern for Jackson deserialization. - * This is a no-op since the factory type is derived from the class itself. - * - * @param factoryType the factory type (ignored) - */ - public void setFactoryType(String factoryType) { - // No-op: factory type is determined by the class, not set externally - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java deleted file mode 100644 index f4b7e75706..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.pdf; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.junit.jupiter.api.Test; - -import org.apache.tika.exception.AccessPermissionException; -import org.apache.tika.metadata.AccessPermissions; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.PropertyTypeException; - -public class AccessCheckerTest { - - @Test - public void testDontCheck() throws AccessPermissionException { - //test that there are no thrown exceptions with DONT_CHECK mode - Metadata m = getMetadata(false, false); - //legacy behavior; don't bother checking - AccessChecker checker = new AccessChecker(); - checker.check(m); - - m = getMetadata(false, true); - checker.check(m); - - m = getMetadata(true, true); - checker.check(m); - - // Explicitly set DONT_CHECK mode - checker = new AccessChecker(AccessChecker.AccessCheckMode.DONT_CHECK); - m = getMetadata(false, false); - checker.check(m); - } - - @Test - public void testEnforcePermissions() { - Metadata m = null; - // ENFORCE_PERMISSIONS - no extraction allowed if blocked - AccessChecker checker = new AccessChecker(AccessChecker.AccessCheckMode.ENFORCE_PERMISSIONS); - boolean ex = false; - try { - m = getMetadata(false, false); - checker.check(m); - } catch (AccessPermissionException e) { - ex = true; - } - assertTrue(ex, "correct exception with no extraction, no extract for accessibility"); - ex = false; - try { - //document allows extraction for accessibility - m = getMetadata(false, true); - checker.check(m); - } catch (AccessPermissionException e) { - //but ENFORCE_PERMISSIONS mode doesn't allow it - ex = true; - } - assertTrue(ex, "correct exception with no extraction, enforce permissions"); - } - - @Test - public void testAllowExtractionForAccessibility() throws AccessPermissionException { - Metadata m = getMetadata(false, true); - // ALLOW_EXTRACTION_FOR_ACCESSIBILITY mode - AccessChecker checker = new AccessChecker(AccessChecker.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY); - checker.check(m); - assertTrue(true, "no exception"); - boolean ex = false; - try { - m = getMetadata(false, false); - checker.check(m); - } catch (AccessPermissionException e) { - ex = true; - } - assertTrue(ex, "correct exception"); - } - - @Test - public void testIllogicalExtractNotForAccessibility() throws AccessPermissionException { - Metadata m = getMetadata(true, false); - // ALLOW_EXTRACTION_FOR_ACCESSIBILITY mode - AccessChecker checker = new AccessChecker(AccessChecker.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY); - checker.check(m); - assertTrue(true, "no exception"); - - // ENFORCE_PERMISSIONS mode - checker = new AccessChecker(AccessChecker.AccessCheckMode.ENFORCE_PERMISSIONS); - //if extract content is allowed, the checker shouldn't - //check the value of extract for accessibility - checker.check(m); - assertTrue(true, "no exception"); - } - - @Test - public void testCantAddMultiplesToMetadata() { - Metadata m = new Metadata(); - boolean ex = false; - m.add(AccessPermissions.EXTRACT_CONTENT, "true"); - try { - m.add(AccessPermissions.EXTRACT_CONTENT, "false"); - } catch (PropertyTypeException e) { - ex = true; - } - assertTrue(ex, "can't add multiple values"); - - m = new Metadata(); - ex = false; - m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true"); - try { - m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false"); - } catch (PropertyTypeException e) { - ex = true; - } - assertTrue(ex, "can't add multiple values"); - } - - private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) { - Metadata m = new Metadata(); - m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction)); - m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, - Boolean.toString(allowExtractionForAccessibility)); - return m; - } -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java index e1bf5d5000..aef622b400 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java @@ -19,6 +19,7 @@ package org.apache.tika.parser.pdf; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; +import com.fasterxml.jackson.annotation.JsonTypeInfo; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.pdmodel.PDPage; @@ -29,8 +30,52 @@ import org.apache.tika.parser.pdf.image.ImageGraphicsEngine; import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory; import org.apache.tika.sax.XHTMLContentHandler; +/** + * Example custom ImageGraphicsEngineFactory demonstrating how users can create + * their own factory implementations with custom configuration parameters. + * <p> + * <b>JSON Config File Usage:</b> Use the class name string approach: + * <pre> + * { + * "pdf-parser": { + * "imageGraphicsEngineFactoryClass": "com.example.MyCustomFactory" + * } + * } + * </pre> + * Note: This approach does not support custom parameters; the factory will use default values. + * <p> + * <b>ParseContext Serialization:</b> The {@code @JsonTypeInfo} annotation enables polymorphic + * serialization when using tika-serialization's polymorphic ObjectMapper (e.g., for + * ParseContext round-trip serialization). This requires the annotation on both the base + * class and subclass for full polymorphic support. + */ +@JsonTypeInfo(use = JsonTypeInfo.Id.CLASS, property = "@class") public class MyCustomImageGraphicsEngineFactory extends ImageGraphicsEngineFactory { + /** + * Metadata key used to record that this custom factory was used during parsing. + */ + public static final String CUSTOM_FACTORY_USED = "X-CustomGraphicsEngineFactory-Used"; + + /** + * Metadata key used to record the customParam value. + */ + public static final String CUSTOM_PARAM_KEY = "X-CustomGraphicsEngineFactory-CustomParam"; + + private String customParam = "default"; + + public MyCustomImageGraphicsEngineFactory() { + // Default constructor required for Jackson deserialization + } + + public String getCustomParam() { + return customParam; + } + + public void setCustomParam(String customParam) { + this.customParam = customParam; + } + @Override public ImageGraphicsEngine newEngine(PDPage page, int pageNumber, @@ -39,6 +84,12 @@ public class MyCustomImageGraphicsEngineFactory extends ImageGraphicsEngineFacto Map<COSStream, Integer> processedInlineImages, AtomicInteger imageCounter, XHTMLContentHandler xhtml, Metadata parentMetadata, ParseContext parseContext) { - throw new RuntimeException("testing123"); + // Record that this custom factory was used + parentMetadata.set(CUSTOM_FACTORY_USED, "true"); + parentMetadata.set(CUSTOM_PARAM_KEY, customParam); + + // Delegate to the default implementation + return super.newEngine(page, pageNumber, embeddedDocumentExtractor, pdfParserConfig, + processedInlineImages, imageCounter, xhtml, parentMetadata, parseContext); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index b617b77d79..3740e65399 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -19,13 +19,12 @@ package org.apache.tika.parser.pdf; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; import java.io.InputStream; +import java.nio.file.Path; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -46,10 +45,9 @@ import org.xml.sax.ContentHandler; import org.apache.tika.Tika; import org.apache.tika.TikaTest; -import org.apache.tika.config.TikaConfig; +import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; -import org.apache.tika.exception.TikaException; import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.metadata.Font; @@ -70,7 +68,6 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; -import org.apache.tika.utils.ExceptionUtils; /** * Test case for parsing pdf files. @@ -883,7 +880,7 @@ public class PDFParserTest extends TikaTest { PDFParserConfig config = new PDFParserConfig(); //don't allow extraction, not even for accessibility - config.setAccessChecker(new AccessChecker(false)); + config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.IGNORE_ACCESSIBILITY_ALLOWANCE); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); @@ -894,7 +891,7 @@ public class PDFParserTest extends TikaTest { AccessPermissionException.class); } - config.setAccessChecker(new AccessChecker(true)); + config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY); assertException("/test-documents/" + "testPDF_no_extract_no_accessibility_owner_empty.pdf", AUTO_DETECT_PARSER, context, AccessPermissionException.class); @@ -908,7 +905,7 @@ public class PDFParserTest extends TikaTest { PDFParserConfig config = new PDFParserConfig(); //don't allow extraction, not even for accessibility - config.setAccessChecker(new AccessChecker(false)); + config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.IGNORE_ACCESSIBILITY_ALLOWANCE); PasswordProvider passwordProvider = new PasswordProvider() { @Override public String getPassword(Metadata metadata) { @@ -927,7 +924,7 @@ public class PDFParserTest extends TikaTest { } //bad password is still a bad password - config.setAccessChecker(new AccessChecker(true)); + config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY); for (String path : new String[]{"testPDF_no_extract_no_accessibility_owner_empty.pdf", "testPDF_no_extract_yes_accessibility_owner_empty.pdf",}) { assertException("/test-documents/" + path, AUTO_DETECT_PARSER, context, @@ -941,7 +938,7 @@ public class PDFParserTest extends TikaTest { assertContains("Hello World", getXML("testPDF_no_extract_yes_accessibility_owner_user.pdf", context).xml); - config.setAccessChecker(new AccessChecker(false)); + config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.IGNORE_ACCESSIBILITY_ALLOWANCE); for (String path : new String[]{"testPDF_no_extract_no_accessibility_owner_user.pdf", "testPDF_no_extract_yes_accessibility_owner_user.pdf",}) { assertException("/test-documents/" + path, AUTO_DETECT_PARSER, context, @@ -955,7 +952,7 @@ public class PDFParserTest extends TikaTest { PDFParserConfig config = new PDFParserConfig(); //don't allow extraction, not even for accessibility - config.setAccessChecker(new AccessChecker(true)); + config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY); PasswordProvider passwordProvider = new PasswordProvider() { @Override public String getPassword(Metadata metadata) { @@ -977,7 +974,7 @@ public class PDFParserTest extends TikaTest { } //really, with owner's password, all extraction is allowed - config.setAccessChecker(new AccessChecker(false)); + config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.IGNORE_ACCESSIBILITY_ALLOWANCE); for (String path : new String[]{"testPDF_no_extract_no_accessibility_owner_user.pdf", "testPDF_no_extract_yes_accessibility_owner_user.pdf", "testPDF_no_extract_no_accessibility_owner_empty.pdf", @@ -1101,45 +1098,43 @@ public class PDFParserTest extends TikaTest { @Test public void testInitializationViaConfig() throws Exception { - try (InputStream is = getResourceAsStream( - "/org/apache/tika/parser/pdf/tika-config.xml")) { - assertNotNull(is); - TikaConfig tikaConfig = new TikaConfig(is); - Parser p = new AutoDetectParser(tikaConfig); - - String text = - getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p); - text = text.replaceAll("\\s+", " "); - - // Column text is now interleaved: - assertContains( - "Left column line 1 Right column line 1 " + - "Left colu mn line 2 Right column line 2", - text); - - //test overriding underlying settings with PDFParserConfig - ParseContext pc = new ParseContext(); - PDFParserConfig config = new PDFParserConfig(); - config.setSortByPosition(false); - pc.set(PDFParserConfig.class, config); - text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc); - text = text.replaceAll("\\s+", " "); - // Column text is not interleaved: - assertContains("Left column line 1 Left column line 2 ", text); - - //test a new PDFParserConfig and setting another value - //this tests that a new PDFParserConfig completely resets - //behavior - config = new PDFParserConfig(); - config.setOcrDPI(10000); - config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); - pc.set(PDFParserConfig.class, config); - text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc); - text = text.replaceAll("\\s+", " "); - - // Column text is not interleaved: - assertContains("Left column line 1 Left column line 2 ", text); - } + Path configPath = Path.of(getClass().getResource( + "/org/apache/tika/parser/pdf/tika-config.json").toURI()); + TikaLoader loader = TikaLoader.load(configPath); + Parser p = loader.loadAutoDetectParser(); + + String text = + getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p); + text = text.replaceAll("\\s+", " "); + + // Column text is now interleaved: + assertContains( + "Left column line 1 Right column line 1 " + + "Left colu mn line 2 Right column line 2", + text); + + //test overriding underlying settings with PDFParserConfig + ParseContext pc = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + config.setSortByPosition(false); + pc.set(PDFParserConfig.class, config); + text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc); + text = text.replaceAll("\\s+", " "); + // Column text is not interleaved: + assertContains("Left column line 1 Left column line 2 ", text); + + //test a new PDFParserConfig and setting another value + //this tests that a new PDFParserConfig completely resets + //behavior + config = new PDFParserConfig(); + config.setOcrDPI(10000); + config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); + pc.set(PDFParserConfig.class, config); + text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc); + text = text.replaceAll("\\s+", " "); + + // Column text is not interleaved: + assertContains("Left column line 1 Left column line 2 ", text); } // Moved to tika-parsers-standard-package PDFParserTest.testInitializationOfNonPrimitivesViaJsonConfig @@ -1162,30 +1157,28 @@ public class PDFParserTest extends TikaTest { @Test public void testConfiguringMoreParams() throws Exception { - try (InputStream configIs = getResourceAsStream( - "/org/apache/tika/parser/pdf/tika-inline-config.xml")) { - assertNotNull(configIs); - TikaConfig tikaConfig = new TikaConfig(configIs); - AutoDetectParser p = new AutoDetectParser(tikaConfig); - //make absolutely certain the functionality works! - List<Metadata> metadata = getRecursiveMetadata("testOCR.pdf", p); - assertEquals(2, metadata.size()); - Map<MediaType, Parser> parsers = p.getParsers(); - Parser composite = parsers.get(MediaType.application("pdf")); - Parser pdfParser = - ((CompositeParser) composite).getParsers().get(MediaType.application("pdf")); - assertTrue(pdfParser instanceof PDFParser); - PDFParserConfig pdfParserConfig = ((PDFParser) pdfParser).getPDFParserConfig(); - assertEquals(new AccessChecker(true), pdfParserConfig.getAccessChecker()); - assertEquals(true, pdfParserConfig.isExtractInlineImages()); - assertEquals(false, pdfParserConfig.isExtractUniqueInlineImagesOnly()); - assertEquals(314, pdfParserConfig.getOcrDPI()); - assertEquals(2.1f, pdfParserConfig.getOcrImageQuality(), .01f); - assertEquals("jpeg", pdfParserConfig.getOcrImageFormatName()); - assertEquals(524288000, pdfParserConfig.getMaxMainMemoryBytes()); - assertEquals(false, pdfParserConfig.isCatchIntermediateIOExceptions()); - - } + Path configPath = Path.of(getClass().getResource( + "/org/apache/tika/parser/pdf/tika-inline-config.json").toURI()); + TikaLoader loader = TikaLoader.load(configPath); + AutoDetectParser p = (AutoDetectParser) loader.loadAutoDetectParser(); + //make absolutely certain the functionality works! + List<Metadata> metadata = getRecursiveMetadata("testOCR.pdf", p); + assertEquals(2, metadata.size()); + Map<MediaType, Parser> parsers = p.getParsers(); + Parser composite = parsers.get(MediaType.application("pdf")); + Parser pdfParser = + ((CompositeParser) composite).getParsers().get(MediaType.application("pdf")); + assertTrue(pdfParser instanceof PDFParser); + PDFParserConfig pdfParserConfig = ((PDFParser) pdfParser).getPDFParserConfig(); + assertEquals(PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY, + pdfParserConfig.getAccessCheckMode()); + assertEquals(true, pdfParserConfig.isExtractInlineImages()); + assertEquals(false, pdfParserConfig.isExtractUniqueInlineImagesOnly()); + assertEquals(314, pdfParserConfig.getOcrDPI()); + assertEquals(2.1f, pdfParserConfig.getOcrImageQuality(), .01f); + assertEquals("jpeg", pdfParserConfig.getOcrImageFormatName()); + assertEquals(524288000, pdfParserConfig.getMaxMainMemoryBytes()); + assertEquals(false, pdfParserConfig.isCatchIntermediateIOExceptions()); } //TODO: figure out how to test jp2 embedded with OCR @@ -1395,20 +1388,22 @@ public class PDFParserTest extends TikaTest { @Test public void testCustomGraphicsEngineFactory() throws Exception { - try (InputStream is = - getResourceAsStream( - "tika-config-custom-graphics-engine.xml")) { - assertNotNull(is); - TikaConfig tikaConfig = new TikaConfig(is); - Parser p = new AutoDetectParser(tikaConfig); - try { - List<Metadata> metadataList = getRecursiveMetadata("testPDF_JBIG2.pdf", p); - fail("should have thrown a runtime exception"); - } catch (TikaException e) { - String stack = ExceptionUtils.getStackTrace(e); - assertContains("testing123", stack); - } - } + Path configPath = Path.of(getClass().getResource( + "tika-config-custom-graphics-engine.json").toURI()); + TikaLoader loader = TikaLoader.load(configPath); + Parser p = loader.loadAutoDetectParser(); + + // Parse a PDF with inline images to trigger the custom graphics engine factory + List<Metadata> metadataList = getRecursiveMetadata("testPDF_JBIG2.pdf", p); + + // Verify the custom factory was used + // Note: customParam uses default value since JSON config uses class name string + // (polymorphic config with params requires @JsonTypeInfo on base class) + Metadata metadata = metadataList.get(0); + assertEquals("true", metadata.get(MyCustomImageGraphicsEngineFactory.CUSTOM_FACTORY_USED), + "Custom graphics engine factory should have been used"); + assertEquals("default", metadata.get(MyCustomImageGraphicsEngineFactory.CUSTOM_PARAM_KEY), + "customParam should have default value when using class name string config"); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.json new file mode 100644 index 0000000000..2411ca211e --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.json @@ -0,0 +1,11 @@ +{ + "parsers": [ + { + "pdf-parser": { + "sortByPosition": true, + "extractInlineImages": true, + "imageGraphicsEngineFactoryClass": "org.apache.tika.parser.pdf.MyCustomImageGraphicsEngineFactory" + } + } + ] +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml deleted file mode 100644 index 5aa259feeb..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml +++ /dev/null @@ -1,28 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="sortByPosition" type="bool">true</param> - <param name="extractInlineImages" type="bool">true</param> - <param name="imageGraphicsEngineFactory" class="org.apache.tika.parser.pdf.MyCustomImageGraphicsEngineFactory"/> - </params> - </parser> - </parsers> -</properties> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml deleted file mode 100644 index 3cc9d8b237..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml +++ /dev/null @@ -1,29 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="sortByPosition" type="bool">true</param> - <param name="ocrImageType" type="string">rgb</param> - <param name="ocrStrategy" type="string">ocr_only</param> - - </params> - </parser> - </parsers> -</properties> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.json new file mode 100644 index 0000000000..a16c7e4d96 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.json @@ -0,0 +1,9 @@ +{ + "parsers": [ + { + "pdf-parser": { + "sortByPosition": true + } + } + ] +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml deleted file mode 100644 index 98940da24b..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml +++ /dev/null @@ -1,26 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="sortByPosition" type="bool">true</param> - </params> - </parser> - </parsers> -</properties> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json new file mode 100644 index 0000000000..deaea70b79 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json @@ -0,0 +1,19 @@ +{ + "parsers": [ + { + "default-parser": {} + }, + { + "pdf-parser": { + "extractInlineImages": true, + "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY", + "catchIntermediateIOExceptions": false, + "extractUniqueInlineImagesOnly": false, + "ocrDPI": 314, + "ocrImageQuality": 2.1, + "ocrImageFormat": "JPEG", + "maxMainMemoryBytes": 524288000 + } + } + ] +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml deleted file mode 100644 index bffe8be380..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml +++ /dev/null @@ -1,38 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"/> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractInlineImages" type="bool">true</param> - <param name="accessCheckMode" type="string">ALLOW_EXTRACTION_FOR_ACCESSIBILITY</param> - <param name="catchIntermediateExceptions" type="bool">false</param> - <param name="extractUniqueInlineImagesOnly" type="bool">false</param> - <param name="catchIntermediateExceptions" type="bool">false</param> - <param name="ocrDPI" type="int">314</param> - <param name="ocrImageQuality" type="float">2.1</param> - <param name="ocrImageFormat" type="string">JPEG</param> - <param name="ocrImageScale" type="float">1.3</param> - <param name="maxMainMemoryBytes" type="long">524288000</param> - <!-- we really should throw an exception for this!! --> - <param name="someRandomThingOrOther" type="bool">true</param> - </params> - </parser> - </parsers> -</properties> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml deleted file mode 100644 index e187601190..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml +++ /dev/null @@ -1,36 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - </parser> - <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"> - <params> - <param name="maxFileSizeToOcr" type="long">100</param> - </params> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractInlineImages" type="bool">false</param> - <param name="ocrStrategy" type="string">ocr_only</param> - </params> - </parser> - </parsers> -</properties> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml deleted file mode 100644 index 92f351bb9a..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml +++ /dev/null @@ -1,34 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="imageStrategy" type="string">renderPagesBeforeParse</param> - </params> - </parser> - </parsers> -<!-- - This will be supplied automatically if not specified. - <renderers> - <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/> - </renderers> --> -</properties> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml deleted file mode 100644 index e3f92df6c9..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml +++ /dev/null @@ -1,32 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="imageStrategy" type="string">renderPagesAtPageEnd</param> - </params> - </parser> - </parsers> -<!-- <renderers> - <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/> - </renderers> --> -</properties> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml deleted file mode 100644 index 20adbf2880..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml +++ /dev/null @@ -1,24 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - </parser> - <parser class="org.apache.tika.parser.xml.XMLProfiler"/> - </parsers> -</properties> diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java index 011f149cc5..518bffd160 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java @@ -104,7 +104,7 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { } catch (ClassNotFoundException e) { LOG.debug("Class not found for key '{}', storing in ConfigContainer", fieldName); } catch (Exception e) { - LOG.warn("Failed to deserialize '{}' directly, storing in ConfigContainer", fieldName, e); + throw new IOException("Failed to deserialize '" + fieldName + "': " + e.getMessage(), e); } }
