This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a66b33f6b2 TIKA-4567 -- update PDFParserConfig for the new world
a66b33f6b2 is described below
commit a66b33f6b269e52473608e1764eb94c84fac7467
Author: Tim Allison <[email protected]>
AuthorDate: Thu Dec 11 12:10:52 2025 -0500
TIKA-4567 -- update PDFParserConfig for the new world
---
.../resources/tika-config-default-single-file.json | 5 +-
.../org/apache/tika/config/ConfigDeserializer.java | 5 +
.../src/test/resources/kafka/tika-config-kafka.xml | 72 -----
.../src/test/resources/tika-config-kafka.xml | 123 ---------
.../opensearch/tika-config-opensearch.json | 5 +-
.../src/test/resources/s3/tika-config-s3.json | 5 +-
.../src/test/resources/s3/tika-config-s3.xml | 68 -----
.../resources/tika-config-s3-integration-test.xml | 121 --------
.../src/test/resources/tika-config-s3ToFs.xml | 37 ---
.../src/test/resources/tika-config-s3Tos3.xml | 47 ----
.../src/test/resources/solr/tika-config-solr.xml | 70 -----
.../src/test/resources/tika-config-solr-urls.json | 5 +-
.../src/test/resources/tika-config-solr-urls.xml | 120 --------
.../tika-parser-pdf-module/pom.xml | 6 +
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 22 +-
.../org/apache/tika/parser/pdf/AccessChecker.java | 123 ---------
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 2 +-
.../java/org/apache/tika/parser/pdf/OcrConfig.java | 181 ++++++++++++
.../java/org/apache/tika/parser/pdf/PDFParser.java | 67 +++--
.../apache/tika/parser/pdf/PDFParserConfig.java | 303 ++++++---------------
.../pdf/image/ImageGraphicsEngineFactory.java | 19 --
.../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 4 +-
.../apache/tika/parser/pdf/AccessCheckerTest.java | 135 ---------
.../pdf/MyCustomImageGraphicsEngineFactory.java | 53 +++-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 173 ++++++------
.../pdf/tika-config-custom-graphics-engine.json | 11 +
.../pdf/tika-config-custom-graphics-engine.xml | 28 --
.../tika/parser/pdf/tika-config-non-primitives.xml | 29 --
.../org/apache/tika/parser/pdf/tika-config.json | 9 +
.../org/apache/tika/parser/pdf/tika-config.xml | 26 --
.../apache/tika/parser/pdf/tika-inline-config.json | 19 ++
.../apache/tika/parser/pdf/tika-inline-config.xml | 38 ---
.../org/apache/tika/parser/pdf/tika-ocr-config.xml | 36 ---
.../tika/parser/pdf/tika-rendering-config.xml | 34 ---
.../parser/pdf/tika-rendering-per-page-config.xml | 32 ---
.../tika/parser/pdf/tika-xml-profiler-config.xml | 24 --
.../tika/config/TikaConfigSerializerTest.java | 4 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 3 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 42 +--
.../serialization/ParseContextDeserializer.java | 2 +-
.../standard/UnpackerResourceWithConfigTest.java | 4 +-
41 files changed, 540 insertions(+), 1572 deletions(-)
diff --git a/tika-app/src/main/resources/tika-config-default-single-file.json b/tika-app/src/main/resources/tika-config-default-single-file.json
index 696a8f6414..e9af227964 100644
--- a/tika-app/src/main/resources/tika-config-default-single-file.json
+++ b/tika-app/src/main/resources/tika-config-default-single-file.json
@@ -7,10 +7,7 @@
"pdf-parser": {
"extractActions": true,
"extractInlineImages": true,
- "accessChecker": {
- "needToCheck": true,
- "allowExtractionForAccessibility": true
- },
+ "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY",
"extractIncrementalUpdateInfo": true,
"parseIncrementalUpdates":true
diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java b/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java
index e64b9ef82c..18ab2ff12a 100644
--- a/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java
+++ b/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java
@@ -62,6 +62,11 @@ public class ConfigDeserializer {
Method method = null;
try {
clazz = Class.forName("com.fasterxml.jackson.databind.ObjectMapper");
+ // Use a plain ObjectMapper for simple config deserialization.
+ // The polymorphic mapper from tika-serialization is meant for ParseContext
+ // serialization with actual polymorphic types, not for simple config classes.
+ //TODO -- we need to revisit this. We should be using the same object mapper for
+ //config files and for runtime configs
instance = clazz.getDeclaredConstructor().newInstance();
method = clazz.getMethod("readValue", String.class, Class.class);
} catch (Exception e) {
diff --git
a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/tika-config-kafka.xml
b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/tika-config-kafka.xml
deleted file mode 100644
index fb29c9ad6a..0000000000
---
a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/tika-config-kafka.xml
+++ /dev/null
@@ -1,72 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
- <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractActions" type="bool">true</param>
- <param name="accessChecker"
type="org.apache.tika.parser.pdf.AccessChecker">
- <params>
- <param name="needToCheck" type="bool">true</param>
- <param name="allowExtractionForAccessibility"
type="bool">true</param>
- </params>
- </param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
- <params>
- <param name="includeDeletedContent" type="bool">true</param>
- <param name="includeMoveFromContent" type="bool">true</param>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.OfficeParser">
- <params>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- </parsers>
- <metadataFilters>
- <!-- depending on the file format, some dates do not have a timezone. This
- filter arbitrarily assumes dates have a UTC timezone and will format
all
- dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a
timezone.
- -->
- <metadataFilter
class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
- <metadataFilter
class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
- <params>
- <excludeUnmapped>true</excludeUnmapped>
- <mappings>
- <mapping from="X-TIKA:content" to="content_s"/>
- <mapping from="Content-Length" to="length_i"/>
- <mapping from="dc:creator" to="creators_ss"/>
- <mapping from="dc:title" to="title_s"/>
- <mapping from="Content-Type" to="mime_s"/>
- <mapping from="X-TIKA:EXCEPTION:container_exception"
to="tika_exception_s"/>
- </mappings>
- </params>
- </metadataFilter>
- </metadataFilters>
-</properties>
diff --git
a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml
b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml
deleted file mode 100644
index 820e1bc7b5..0000000000
---
a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/tika-config-kafka.xml
+++ /dev/null
@@ -1,123 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
- <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractActions" type="bool">true</param>
- <param name="accessChecker"
type="org.apache.tika.parser.pdf.AccessChecker">
- <params>
- <param name="needToCheck" type="bool">true</param>
- <param name="allowExtractionForAccessibility"
type="bool">true</param>
- </params>
- </param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
- <params>
- <param name="includeDeletedContent" type="bool">true</param>
- <param name="includeMoveFromContent" type="bool">true</param>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.OfficeParser">
- <params>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- </parsers>
- <metadataFilters>
- <!-- depending on the file format, some dates do not have a timezone. This
- filter arbitrarily assumes dates have a UTC timezone and will format
all
- dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a
timezone.
- -->
- <metadataFilter
class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
- <metadataFilter
class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
- <params>
- <excludeUnmapped>true</excludeUnmapped>
- <mappings>
- <mapping from="X-TIKA:content" to="content_s"/>
- <mapping from="Content-Length" to="length_i"/>
- <mapping from="dc:creator" to="creators_ss"/>
- <mapping from="dc:title" to="title_s"/>
- <mapping from="Content-Type" to="mime_s"/>
- <mapping from="X-TIKA:EXCEPTION:container_exception"
to="tika_exception_s"/>
- </mappings>
- </params>
- </metadataFilter>
- </metadataFilters>
- <async>
- <params>
- <directEmitThresholdBytes>10000</directEmitThresholdBytes>
- <emitMaxEstimatedBytes>100000</emitMaxEstimatedBytes>
- <emitWithinMillis>10</emitWithinMillis>
- <numEmitters>1</numEmitters>
- <numClients>1</numClients>
- <tikaConfig>{TIKA_CONFIG}</tikaConfig>
- <forkedJvmArgs>
- <arg>-Xmx1g</arg>
- <arg>-XX:ParallelGCThreads=2</arg>
- <arg>-XX:+ExitOnOutOfMemoryError</arg>
- <arg>-Dlog4j.configurationFile={LOG4J_PROPERTIES_FILE}</arg>
- </forkedJvmArgs>
- <timeoutMillis>60000</timeoutMillis>
- </params>
- </async>
- <fetchers>
- <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
- <params>
- <name>fsf</name>
- <basePath>{PATH_TO_DOCS}</basePath>
- </params>
- </fetcher>
- </fetchers>
- <emitters>
- <emitter class="org.apache.tika.pipes.emitter.kafka.KafkaEmitter">
- <params>
- <name>ke</name>
- <topic>{EMITTER_TOPIC}</topic>
- <bootstrapServers>{BOOTSTRAP_SERVERS}</bootstrapServers>
- </params>
- </emitter>
- <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
- <params>
- <name>fse</name>
- <basePath>/path/to/extracts</basePath>
- </params>
- </emitter>
- </emitters>
- <pipesIterator
class="org.apache.tika.pipes.iterator.kafka.KafkaPipesIterator">
- <params>
- <topic>{PIPE_ITERATOR_TOPIC}</topic>
- <bootstrapServers>{BOOTSTRAP_SERVERS}</bootstrapServers>
- <groupId>grpid</groupId>
- <autoOffsetReset>earliest</autoOffsetReset>
- <pollDelayMs>1000</pollDelayMs>
- <fetcherName>fsf</fetcherName>
- <emitterName>ke</emitterName>
- </params>
- </pipesIterator>
-</properties>
\ No newline at end of file
diff --git
a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json
b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json
index 2381bca4ee..4a08cfa27d 100644
---
a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json
+++
b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json
@@ -6,10 +6,7 @@
{
"pdf-parser": {
"extractActions": true,
- "accessChecker": {
- "needToCheck": true,
- "allowExtractionForAccessibility": true
- }
+ "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY"
}
},
{
diff --git
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
index 017a047b95..043da2349f 100644
---
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
+++
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
@@ -15,10 +15,7 @@
{
"pdf-parser": {
"extractActions": true,
- "accessChecker": {
- "needToCheck": true,
- "allowExtractionForAccessibility": true
- }
+ "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY"
}
},
{
diff --git
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.xml
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.xml
deleted file mode 100644
index c151bf907f..0000000000
---
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.xml
+++ /dev/null
@@ -1,68 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
- <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractActions" type="bool">true</param>
- <param name="accessChecker"
type="org.apache.tika.parser.pdf.AccessChecker">
- <params>
- <param name="needToCheck" type="bool">true</param>
- <param name="allowExtractionForAccessibility"
type="bool">true</param>
- </params>
- </param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
- <params>
- <param name="includeDeletedContent" type="bool">true</param>
- <param name="includeMoveFromContent" type="bool">true</param>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.OfficeParser">
- <params>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- </parsers>
- <metadataFilters>
- <metadataFilter
class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
- <metadataFilter
class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
- <params>
- <excludeUnmapped>true</excludeUnmapped>
- <mappings>
- <mapping from="X-TIKA:content" to="content_s"/>
- <mapping from="Content-Length" to="length_i"/>
- <mapping from="dc:creator" to="creators_ss"/>
- <mapping from="dc:title" to="title_s"/>
- <mapping from="Content-Type" to="mime_s"/>
- <mapping from="X-TIKA:EXCEPTION:container_exception"
to="tika_exception_s"/>
- </mappings>
- </params>
- </metadataFilter>
- </metadataFilters>
-</properties>
diff --git
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml
deleted file mode 100644
index 7b361483c8..0000000000
---
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3-integration-test.xml
+++ /dev/null
@@ -1,121 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
- <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractActions" type="bool">true</param>
- <param name="accessChecker"
type="org.apache.tika.parser.pdf.AccessChecker">
- <params>
- <param name="needToCheck" type="bool">true</param>
- <param name="allowExtractionForAccessibility"
type="bool">true</param>
- </params>
- </param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
- <params>
- <param name="includeDeletedContent" type="bool">true</param>
- <param name="includeMoveFromContent" type="bool">true</param>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.OfficeParser">
- <params>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- </parsers>
- <metadataFilters>
- <!-- depending on the file format, some dates do not have a timezone. This
- filter arbitrarily assumes dates have a UTC timezone and will format
all
- dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a
timezone.
- -->
- <metadataFilter
class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
- <metadataFilter
class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
- <excludeUnmapped>true</excludeUnmapped>
- <mappings>
- <mapping from="X-TIKA:content" to="content_s"/>
- <mapping from="Content-Length" to="length_i"/>
- <mapping from="dc:creator" to="creators_ss"/>
- <mapping from="dc:title" to="title_s"/>
- <mapping from="Content-Type" to="mime_s"/>
- <mapping from="X-TIKA:EXCEPTION:container_exception"
to="tika_exception_s"/>
- </mappings>
- </metadataFilter>
- </metadataFilters>
- <async>
- <directEmitThresholdBytes>10000</directEmitThresholdBytes>
- <emitMaxEstimatedBytes>100000</emitMaxEstimatedBytes>
- <emitWithinMillis>10</emitWithinMillis>
- <numEmitters>1</numEmitters>
- <numClients>1</numClients>
- <tikaConfig>{TIKA_CONFIG}</tikaConfig>
- <forkedJvmArgs>
- <arg>-Xmx1g</arg>
- <arg>-XX:ParallelGCThreads=2</arg>
- <arg>-XX:+ExitOnOutOfMemoryError</arg>
- <arg>-Dlog4j.configurationFile={LOG4J_PROPERTIES_FILE}</arg>
- </forkedJvmArgs>
- <timeoutMillis>60000</timeoutMillis>
- </async>
- <fetchers>
- <fetcher class="org.apache.tika.pipes.fetcher.s3.S3Fetcher">
- <name>s3f</name>
- <region>{REGION}</region>
- <bucket>{FETCH_BUCKET}</bucket>
- <credentialsProvider>key_secret</credentialsProvider>
- <accessKey>{ACCESS_KEY}</accessKey>
- <secretKey>{SECRET_KEY}</secretKey>
-
<endpointConfigurationService>{ENDPOINT_CONFIGURATION_SERVICE}</endpointConfigurationService>
- <pathStyleAccessEnabled>true</pathStyleAccessEnabled>
- <throttleSeconds>30,120,600,1200</throttleSeconds>
- </fetcher>
- </fetchers>
- <pipesIterator class="org.apache.tika.pipes.iterator.S3PipesIterator">
- <emitterName>s3e</emitterName>
- <fetcherName>s3f</fetcherName>
- <region>{REGION}</region>
- <bucket>{PIPE_ITERATOR_BUCKET}</bucket>
- <credentialsProvider>key_secret</credentialsProvider>
- <accessKey>{ACCESS_KEY}</accessKey>
- <secretKey>{SECRET_KEY}</secretKey>
-
<endpointConfigurationService>{ENDPOINT_CONFIGURATION_SERVICE}</endpointConfigurationService>
- <pathStyleAccessEnabled>true</pathStyleAccessEnabled>
- </pipesIterator>
- <emitters>
- <emitter class="org.apache.tika.pipes.emitter.s3.S3Emitter">
- <name>s3e</name>
- <region>{REGION}</region>
- <bucket>{EMIT_BUCKET}</bucket>
- <credentialsProvider>key_secret</credentialsProvider>
- <accessKey>{ACCESS_KEY}</accessKey>
- <secretKey>{SECRET_KEY}</secretKey>
-
<endpointConfigurationService>{ENDPOINT_CONFIGURATION_SERVICE}</endpointConfigurationService>
- <pathStyleAccessEnabled>true</pathStyleAccessEnabled>
- </emitter>
- </emitters>
-</properties>
\ No newline at end of file
diff --git
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml
deleted file mode 100644
index b02906f5d4..0000000000
---
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3ToFs.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <fetchers>
- <fetcher class="org.apache.tika.pipes.fetcher.s3.S3Fetcher">
- <name>s3f</name>
- <region>us-east-1</region>
- <profile>default</profile>
- <bucket><!-- fill in here --></bucket>
- <credentialsProvider>profile</credentialsProvider>
- </fetcher>
- </fetchers>
- <pipesIterator class="org.apache.tika.pipes.iterator.S3PipesIterator">
- <fetcherName>s3f</fetcherName>
- <bucket><!-- fill in here --></bucket>
- <region>us-east-1</region>
- <profile>default</profile>
- <credentialsProvider>profile</credentialsProvider>
- </pipesIterator>
-</properties>
\ No newline at end of file
diff --git
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml
deleted file mode 100644
index fc30a56634..0000000000
---
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/tika-config-s3Tos3.xml
+++ /dev/null
@@ -1,47 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <fetchers>
- <fetcher class="org.apache.tika.pipes.fetcher.s3.S3Fetcher">
- <name>s3f</name>
- <region>us-east-1</region>
- <bucket><!-- fill in here --></bucket>
- <profile>default</profile>
- <credentialsProvider>profile</credentialsProvider>
- </fetcher>
- </fetchers>
- <pipesIterator class="org.apache.tika.pipes.iterator.S3PipesIterator">
- <fetcherName>s3f</fetcherName>
- <region>us-east-1</region>
- <bucket><!-- fill in here --></bucket>
- <profile>default</profile>
- <credentialsProvider>profile</credentialsProvider>
- </pipesIterator>
- <emitters>
- <emitter class="org.apache.tika.pipes.emitter.s3.S3Emitter">
- <name>s3e</name>
- <region>us-east-1</region>
- <bucket><!-- fill in here --></bucket>
- <profile>default</profile>
- <fileExtension></fileExtension>
- <credentialsProvider>profile</credentialsProvider>
- </emitter>
- </emitters>
-</properties>
\ No newline at end of file
diff --git
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/tika-config-solr.xml
b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/tika-config-solr.xml
deleted file mode 100644
index 53ad77b463..0000000000
---
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/tika-config-solr.xml
+++ /dev/null
@@ -1,70 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
- <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractActions" type="bool">true</param>
- <param name="accessChecker"
type="org.apache.tika.parser.pdf.AccessChecker">
- <params>
- <param name="needToCheck" type="bool">true</param>
- <param name="allowExtractionForAccessibility"
type="bool">true</param>
- </params>
- </param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
- <params>
- <param name="includeDeletedContent" type="bool">true</param>
- <param name="includeMoveFromContent" type="bool">true</param>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.OfficeParser">
- <params>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- </parsers>
- <metadataFilters>
- <!-- depending on the file format, some dates do not have a timezone. This
- filter arbitrarily assumes dates have a UTC timezone and will format
all
- dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a
timezone.
- -->
- <metadataFilter
class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
- <metadataFilter
class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
- <excludeUnmapped>true</excludeUnmapped>
- <mappings>
- <mapping from="X-TIKA:content" to="content_s"/>
- <mapping from="Content-Length" to="length_i"/>
- <mapping from="dc:creator" to="creators_ss"/>
- <mapping from="dc:title" to="title_s"/>
- <mapping from="Content-Type" to="mime_s"/>
- <mapping from="X-TIKA:EXCEPTION:container_exception"
to="tika_exception_s"/>
- </mappings>
- </metadataFilter>
- </metadataFilters>
-</properties>
diff --git
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json
b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json
index ea91e6c806..375cd94cc6 100644
---
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json
+++
b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json
@@ -6,10 +6,7 @@
{
"pdf-parser": {
"extractActions": true,
- "accessChecker": {
- "needToCheck": true,
- "allowExtractionForAccessibility": true
- }
+ "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY"
}
},
{
diff --git
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
deleted file mode 100644
index ba17c705cd..0000000000
---
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
+++ /dev/null
@@ -1,120 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
- <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractActions" type="bool">true</param>
- <param name="accessChecker"
type="org.apache.tika.parser.pdf.AccessChecker">
- <params>
- <param name="needToCheck" type="bool">true</param>
- <param name="allowExtractionForAccessibility"
type="bool">true</param>
- </params>
- </param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
- <params>
- <param name="includeDeletedContent" type="bool">true</param>
- <param name="includeMoveFromContent" type="bool">true</param>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.OfficeParser">
- <params>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- </parsers>
- <metadataFilters>
- <!-- depending on the file format, some dates do not have a timezone. This
- filter arbitrarily assumes dates have a UTC timezone and will format
all
- dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a
timezone.
- -->
- <metadataFilter
class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
- <metadataFilter
class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
- <excludeUnmapped>true</excludeUnmapped>
- <mappings>
- <mapping from="X-TIKA:content" to="content_s"/>
- <mapping from="Content-Length" to="length_i"/>
- <mapping from="dc:creator" to="creators_ss"/>
- <mapping from="dc:title" to="title_s"/>
- <mapping from="Content-Type" to="mime_s"/>
- <mapping from="X-TIKA:EXCEPTION:container_exception"
to="tika_exception_s"/>
- </mappings>
- </metadataFilter>
- </metadataFilters>
- <async>
- <directEmitThresholdBytes>10000</directEmitThresholdBytes>
- <emitMaxEstimatedBytes>100000</emitMaxEstimatedBytes>
- <emitWithinMillis>10</emitWithinMillis>
- <numEmitters>1</numEmitters>
- <numClients>1</numClients>
- <tikaConfig>{TIKA_CONFIG}</tikaConfig>
- <forkedJvmArgs>
- <arg>-Xmx1g</arg>
- <arg>-XX:ParallelGCThreads=2</arg>
- <arg>-XX:+ExitOnOutOfMemoryError</arg>
- <arg>-Dlog4j.configurationFile={LOG4J_PROPERTIES_FILE}</arg>
- </forkedJvmArgs>
- <timeoutMillis>60000</timeoutMillis>
- </async>
- <fetchers>
- <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
- <name>fsf</name>
- <basePath>{PATH_TO_DOCS}</basePath>
- </fetcher>
- </fetchers>
- <emitters>
- <emitter class="org.apache.tika.pipes.emitter.solr.SolrEmitter">
- <name>se</name>
- {SOLR_CONNECTION}
- <updateStrategy>{UPDATE_STRATEGY}</updateStrategy>
- <solrCollection>testcol</solrCollection>
- <attachmentStrategy>{ATTACHMENT_STRATEGY}</attachmentStrategy>
- <commitWithin>1</commitWithin>
- <idField>id</idField>
- <connectionTimeout>10000</connectionTimeout>
- <socketTimeout>60000</socketTimeout>
- </emitter>
- <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
- <name>fse</name>
- <basePath>/path/to/extracts</basePath>
- </emitter>
- </emitters>
- <pipesIterator class="org.apache.tika.pipes.emitter.solr.SolrPipesIterator">
- <solrCollection>testcol</solrCollection>
- {SOLR_CONNECTION}
- <idField>id</idField>
- <parsingIdField>parsing_id_i</parsingIdField>
- <failCountField>fail_count_i</failCountField>
- <sizeFieldName>size_i</sizeFieldName>
- <parseMode>{PARSE_MODE}</parseMode>
- <rows>100</rows>
- <fetcherName>fsf</fetcherName>
- <emitterName>se</emitterName>
- </pipesIterator>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
index 8ceb9db51c..1fcf2e76e4 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
@@ -87,6 +87,12 @@
<artifactId>jai-imageio-core</artifactId>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-serialization</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 0d950ec0e0..2d3ee9839a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -16,10 +16,10 @@
*/
package org.apache.tika.parser.pdf;
-import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.AUTO;
-import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
-import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION;
-import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.OCR_ONLY;
+import static org.apache.tika.parser.pdf.OcrConfig.Strategy.AUTO;
+import static org.apache.tika.parser.pdf.OcrConfig.Strategy.NO_OCR;
+import static org.apache.tika.parser.pdf.OcrConfig.Strategy.OCR_AND_TEXT_EXTRACTION;
+import static org.apache.tika.parser.pdf.OcrConfig.Strategy.OCR_ONLY;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
@@ -530,7 +530,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- void doOCROnCurrentPage(PDPage pdPage, PDFParserConfig.OCR_STRATEGY
ocrStrategy)
+ void doOCROnCurrentPage(PDPage pdPage, OcrConfig.Strategy ocrStrategy)
throws IOException, TikaException, SAXException {
if (ocrStrategy.equals(NO_OCR)) {
//I don't think this is reachable?
@@ -541,7 +541,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (c != null) {
c.increment();
}
- MediaType ocrImageMediaType = MediaType.image("ocr-" +
config.getOcrImageFormatName());
+ MediaType ocrImageMediaType = MediaType.image("ocr-" + config.getOcrImageFormat().getFormatName());
if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType))
{
if (ocrStrategy == OCR_ONLY || ocrStrategy ==
OCR_AND_TEXT_EXTRACTION) {
throw new TikaException(
@@ -597,7 +597,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
Renderer thisRenderer = getPDFRenderer(renderer);
//if there's a configured renderer and if the rendering strategy is
"all"
if (thisRenderer != null &&
- config.getOcrRenderingStrategy() ==
PDFParserConfig.OCR_RENDERING_STRATEGY.ALL) {
+ config.getOcrRenderingStrategy() == OcrConfig.RenderingStrategy.ALL) {
PageRangeRequest pageRangeRequest =
new PageRangeRequest(getCurrentPageNo(),
getCurrentPageNo());
if (thisRenderer instanceof PDDocumentRenderer) {
@@ -673,13 +673,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try {
BufferedImage image =
- renderer.renderImageWithDPI(pageIndex, dpi,
config.getOcrImageType().getImageType());
+ renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType().getPdfBoxImageType());
//TODO -- get suffix based on OcrImageType
tmpFile = tmpResources.createTempFile();
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
- ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
os, dpi,
+ ImageIOUtil.writeImage(image, config.getOcrImageFormat().getFormatName(), os, dpi,
config.getOcrImageQuality());
}
} catch (SecurityException e) {
@@ -707,9 +707,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
for (PDAnnotation annotation : page.getAnnotations()) {
processPageAnnotation(annotation);
}
- if (config.getOcrStrategy() ==
PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
+ if (config.getOcrStrategy() == OCR_AND_TEXT_EXTRACTION) {
doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION);
- } else if (config.getOcrStrategy() ==
PDFParserConfig.OCR_STRATEGY.AUTO) {
+ } else if (config.getOcrStrategy() == AUTO) {
boolean unmappedExceedsLimit = false;
if (totalCharsPerPage >
config.getOcrStrategyAuto().getTotalCharsPerPage()) {
// There are enough characters to not have to do OCR.
Check number of unmapped characters
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
deleted file mode 100644
index 4cf307a763..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import java.io.Serializable;
-
-import org.apache.tika.exception.AccessPermissionException;
-import org.apache.tika.metadata.AccessPermissions;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * Checks whether or not a document allows extraction generally
- * or extraction for accessibility only.
- */
-public class AccessChecker implements Serializable {
-
- private static final long serialVersionUID = 6492570218190936986L;
-
- private boolean needToCheck;
- private boolean allowExtractionForAccessibility;
-
- /**
- * This constructs an {@link AccessChecker} that
- * will not perform any checking and will always return without
- * throwing an exception.
- * <p/>
- * This constructor is available to allow for Tika's legacy (<= v1.7)
behavior.
- */
- public AccessChecker() {
- needToCheck = false;
- allowExtractionForAccessibility = true;
- }
-
- /**
- * This constructs an {@link AccessChecker} that will check
- * for whether or not content should be extracted from a document.
- *
- * @param allowExtractionForAccessibility if general extraction is
- * not allowed, is extraction for
accessibility allowed
- */
- public AccessChecker(boolean allowExtractionForAccessibility) {
- needToCheck = true;
- this.allowExtractionForAccessibility = allowExtractionForAccessibility;
- }
-
- public boolean isNeedToCheck() {
- return needToCheck;
- }
-
- public void setNeedToCheck(boolean needToCheck) {
- this.needToCheck = needToCheck;
- }
-
- public boolean isAllowExtractionForAccessibility() {
- return allowExtractionForAccessibility;
- }
-
- public void setAllowExtractionForAccessibility(boolean
allowExtractionForAccessibility) {
- this.allowExtractionForAccessibility = allowExtractionForAccessibility;
- }
-
- /**
- * Checks to see if a document's content should be extracted based
- * on metadata values and the value of {@link
#allowExtractionForAccessibility} in the constructor.
- *
- * @param metadata
- * @throws AccessPermissionException if access is not permitted
- */
- public void check(Metadata metadata) throws AccessPermissionException {
- if (!needToCheck) {
- return;
- }
- if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) {
- if (allowExtractionForAccessibility) {
- if
("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
- return;
- }
- throw new AccessPermissionException(
- "Content extraction for accessibility is not
allowed.");
- }
- throw new AccessPermissionException("Content extraction is not
allowed.");
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- AccessChecker checker = (AccessChecker) o;
-
- if (needToCheck != checker.needToCheck) {
- return false;
- }
- return allowExtractionForAccessibility ==
checker.allowExtractionForAccessibility;
-
- }
-
- @Override
- public int hashCode() {
- int result = (needToCheck ? 1 : 0);
- result = 31 * result + (allowExtractionForAccessibility ? 1 : 0);
- return result;
- }
-}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
index 26a9d33151..8eff5c597e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -94,7 +94,7 @@ class OCR2XHTML extends AbstractPDF2XHTML {
public void processPage(PDPage pdPage) throws IOException {
try {
startPage(pdPage);
- doOCROnCurrentPage(pdPage, PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ doOCROnCurrentPage(pdPage, OcrConfig.Strategy.OCR_ONLY);
endPage(pdPage);
} catch (TikaException | SAXException e) {
throw new IOException(e);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java
new file mode 100644
index 0000000000..f0c56198c9
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.Serializable;
+import java.util.Locale;
+
+/**
+ * Configuration for OCR processing in PDF parsing.
+ * Groups all OCR-related settings together.
+ */
+public class OcrConfig implements Serializable {
+
+ private static final long serialVersionUID = 1L;
+
+ public enum Strategy {
+ AUTO,
+ NO_OCR,
+ OCR_ONLY,
+ OCR_AND_TEXT_EXTRACTION
+ }
+
+ public enum RenderingStrategy {
+ NO_TEXT,
+ TEXT_ONLY,
+ VECTOR_GRAPHICS_ONLY,
+ ALL
+ }
+
+ public enum ImageFormat {
+ PNG, TIFF, JPEG;
+
+ public String getFormatName() {
+ return name().toLowerCase(Locale.ROOT);
+ }
+ }
+
+ public enum ImageType {
+ RGB(org.apache.pdfbox.rendering.ImageType.RGB),
+ GRAY(org.apache.pdfbox.rendering.ImageType.GRAY);
+
+ private final org.apache.pdfbox.rendering.ImageType pdfBoxImageType;
+
+ ImageType(org.apache.pdfbox.rendering.ImageType pdfBoxImageType) {
+ this.pdfBoxImageType = pdfBoxImageType;
+ }
+
+ public org.apache.pdfbox.rendering.ImageType getPdfBoxImageType() {
+ return pdfBoxImageType;
+ }
+ }
+
+ /**
+ * Configuration for AUTO strategy behavior.
+ * Controls when OCR is triggered based on character analysis.
+ */
+ public static class StrategyAuto implements Serializable {
+ private static final long serialVersionUID = 1L;
+
+ public static final StrategyAuto BETTER = new StrategyAuto(10, 10);
+ public static final StrategyAuto FASTER = new StrategyAuto(0.1f, 10);
+
+ private float unmappedUnicodeCharsPerPage;
+ private int totalCharsPerPage;
+
+ public StrategyAuto() {
+ this(10, 10);
+ }
+
+ public StrategyAuto(float unmappedUnicodeCharsPerPage, int totalCharsPerPage) {
+ this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage;
+ this.totalCharsPerPage = totalCharsPerPage;
+ }
+
+ public float getUnmappedUnicodeCharsPerPage() {
+ return unmappedUnicodeCharsPerPage;
+ }
+
+ public void setUnmappedUnicodeCharsPerPage(float unmappedUnicodeCharsPerPage) {
+ this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage;
+ }
+
+ public int getTotalCharsPerPage() {
+ return totalCharsPerPage;
+ }
+
+ public void setTotalCharsPerPage(int totalCharsPerPage) {
+ this.totalCharsPerPage = totalCharsPerPage;
+ }
+
+ @Override
+ public String toString() {
+ String unmappedString;
+ if (unmappedUnicodeCharsPerPage < 1.0) {
+ unmappedString = String.format(Locale.US, "%.03f",
+ unmappedUnicodeCharsPerPage * 100) + "%";
+ } else {
+ unmappedString = String.format(Locale.US, "%.0f", unmappedUnicodeCharsPerPage);
+ }
+ return unmappedString + "," + totalCharsPerPage;
+ }
+ }
+
+ private Strategy strategy = Strategy.AUTO;
+ private StrategyAuto strategyAuto = StrategyAuto.BETTER;
+ private RenderingStrategy renderingStrategy = RenderingStrategy.ALL;
+ private int dpi = 300;
+ private ImageType imageType = ImageType.GRAY;
+ private ImageFormat imageFormat = ImageFormat.PNG;
+ private float imageQuality = 1.0f;
+
+ public Strategy getStrategy() {
+ return strategy;
+ }
+
+ public void setStrategy(Strategy strategy) {
+ this.strategy = strategy;
+ }
+
+ public StrategyAuto getStrategyAuto() {
+ return strategyAuto;
+ }
+
+ public void setStrategyAuto(StrategyAuto strategyAuto) {
+ this.strategyAuto = strategyAuto;
+ }
+
+ public RenderingStrategy getRenderingStrategy() {
+ return renderingStrategy;
+ }
+
+ public void setRenderingStrategy(RenderingStrategy renderingStrategy) {
+ this.renderingStrategy = renderingStrategy;
+ }
+
+ public int getDpi() {
+ return dpi;
+ }
+
+ public void setDpi(int dpi) {
+ this.dpi = dpi;
+ }
+
+ public ImageType getImageType() {
+ return imageType;
+ }
+
+ public void setImageType(ImageType imageType) {
+ this.imageType = imageType;
+ }
+
+ public ImageFormat getImageFormat() {
+ return imageFormat;
+ }
+
+ public void setImageFormat(ImageFormat imageFormat) {
+ this.imageFormat = imageFormat;
+ }
+
+ public float getImageQuality() {
+ return imageQuality;
+ }
+
+ public void setImageQuality(float imageQuality) {
+ this.imageQuality = imageQuality;
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 81e5fa8fe6..848e94f403 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -66,6 +66,7 @@ import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.Param;
import org.apache.tika.config.ParseContextConfig;
import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
@@ -225,14 +226,13 @@ public class PDFParser implements Parser,
RenderingParser, Initializable {
extractMetadata(pdfDocument, metadata, context);
extractSignatures(pdfDocument, metadata);
checkIllustrator(pdfDocument, metadata);
- AccessChecker checker = localConfig.getAccessChecker();
- checker.check(metadata);
+ checkAccessPermissions(localConfig.getAccessCheckMode(), metadata);
renderPagesBeforeParse(tstream, handler, metadata, context,
localConfig);
if (handler != null) {
if (shouldHandleXFAOnly(hasXFA, localConfig)) {
handleXFAOnly(pdfDocument, handler, metadata, context);
} else if (localConfig.getOcrStrategy()
- .equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
+ .equals(OcrConfig.Strategy.OCR_ONLY)) {
OCR2XHTML.process(pdfDocument, handler, context, metadata,
localConfig, renderer);
} else if (hasMarkedContent &&
localConfig.isExtractMarkedContent()) {
@@ -399,6 +399,25 @@ public class PDFParser implements Parser, RenderingParser,
Initializable {
//COSStream aiMetaData =
privateDict.getCOSStream(COSName.AI_META_DATA);
}
+ private void checkAccessPermissions(PDFParserConfig.AccessCheckMode mode, Metadata metadata)
+ throws AccessPermissionException {
+ if (mode == PDFParserConfig.AccessCheckMode.DONT_CHECK) {
+ return;
+ }
+
+ if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) {
+ if (mode == PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY) {
+ if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
+ return;
+ }
+ throw new AccessPermissionException(
+ "Content extraction for accessibility is not allowed.");
+ }
+ // IGNORE_ACCESSIBILITY_ALLOWANCE - don't extract even if accessibility is allowed
+ throw new AccessPermissionException("Content extraction is not allowed.");
+ }
+ }
+
private void extractSignatures(PDDocument pdfDocument, Metadata metadata) {
boolean hasSignature = false;
for (PDSignature signature : pdfDocument.getSignatureDictionaries()) {
@@ -434,7 +453,7 @@ public class PDFParser implements Parser, RenderingParser,
Initializable {
return true;
}
- if (localConfig.getOcrStrategy() ==
PDFParserConfig.OCR_STRATEGY.NO_OCR) {
+ if (localConfig.getOcrStrategy() == OcrConfig.Strategy.NO_OCR) {
return false;
}
//TODO: test that this is not AUTO with no OCR parser installed
@@ -842,38 +861,38 @@ public class PDFParser implements Parser,
RenderingParser, Initializable {
}
@Field
- public void setOcrStrategy(PDFParserConfig.OCR_STRATEGY ocrStrategy) {
+ public void setOcrStrategy(OcrConfig.Strategy ocrStrategy) {
defaultConfig.setOcrStrategy(ocrStrategy);
}
- public PDFParserConfig.OCR_STRATEGY getOcrStrategy() {
+ public OcrConfig.Strategy getOcrStrategy() {
return defaultConfig.getOcrStrategy();
}
@Field
- public void setOcrStrategyAuto(String ocrStrategyAuto) {
- defaultConfig.setOcrStrategyAutoFromString(ocrStrategyAuto);
+ public void setOcrStrategyAuto(OcrConfig.StrategyAuto ocrStrategyAuto) {
+ defaultConfig.setOcrStrategyAuto(ocrStrategyAuto);
}
- public String getOcrStrategyAuto() {
- return defaultConfig.getOcrStrategyAuto().toString();
+ public OcrConfig.StrategyAuto getOcrStrategyAuto() {
+ return defaultConfig.getOcrStrategyAuto();
}
@Field
- public void setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY
ocrRenderingStrategy) {
+ public void setOcrRenderingStrategy(OcrConfig.RenderingStrategy ocrRenderingStrategy) {
defaultConfig.setOcrRenderingStrategy(ocrRenderingStrategy);
}
- public PDFParserConfig.OCR_RENDERING_STRATEGY getOcrRenderingStrategy() {
+ public OcrConfig.RenderingStrategy getOcrRenderingStrategy() {
return defaultConfig.getOcrRenderingStrategy();
}
@Field
- public void setOcrImageType(PDFParserConfig.TikaImageType ocrImageType) {
+ public void setOcrImageType(OcrConfig.ImageType ocrImageType) {
defaultConfig.setOcrImageType(ocrImageType);
}
- public PDFParserConfig.TikaImageType getOcrImageType() {
+ public OcrConfig.ImageType getOcrImageType() {
return defaultConfig.getOcrImageType();
}
@@ -895,12 +914,12 @@ public class PDFParser implements Parser,
RenderingParser, Initializable {
}
@Field
- public void setOcrImageFormatName(String formatName) {
- defaultConfig.setOcrImageFormatName(formatName);
+ public void setOcrImageFormat(OcrConfig.ImageFormat imageFormat) {
+ defaultConfig.setOcrImageFormat(imageFormat);
}
- public String getOcrImageFormatName() {
- return defaultConfig.getOcrImageFormatName();
+ public OcrConfig.ImageFormat getOcrImageFormat() {
+ return defaultConfig.getOcrImageFormat();
}
@Field
@@ -976,12 +995,12 @@ public class PDFParser implements Parser,
RenderingParser, Initializable {
return defaultConfig.isIfXFAExtractOnlyXFA();
}
@Field
- public void setAllowExtractionForAccessibility(boolean
allowExtractionForAccessibility) {
- defaultConfig.setAccessChecker(new
AccessChecker(allowExtractionForAccessibility));
+ public void setAccessCheckMode(PDFParserConfig.AccessCheckMode mode) {
+ defaultConfig.setAccessCheckMode(mode);
}
- public boolean isAllowExtractionForAccessibility() {
- return
defaultConfig.getAccessChecker().isAllowExtractionForAccessibility();
+ public PDFParserConfig.AccessCheckMode getAccessCheckMode() {
+ return defaultConfig.getAccessCheckMode();
}
@Field
@@ -1146,8 +1165,8 @@ public class PDFParser implements Parser,
RenderingParser, Initializable {
//set a default renderer if nothing was defined
PDFBoxRenderer pdfBoxRenderer = new PDFBoxRenderer();
pdfBoxRenderer.setDPI(config.getOcrDPI());
- pdfBoxRenderer.setImageType(config.getOcrImageType().getImageType());
- pdfBoxRenderer.setImageFormatName(config.getOcrImageFormatName());
+ pdfBoxRenderer.setImageType(config.getOcrImageType().getPdfBoxImageType());
+ pdfBoxRenderer.setImageFormatName(config.getOcrImageFormat().getFormatName());
this.renderer = pdfBoxRenderer;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index fa64ad9161..c87acefcd3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -17,11 +17,7 @@
package org.apache.tika.parser.pdf;
import java.io.Serializable;
-import java.util.Locale;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
@@ -38,20 +34,31 @@ import
org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
*/
public class PDFParserConfig implements Serializable {
- public enum TikaImageType {
- RGB(ImageType.RGB),
- GRAY(ImageType.GRAY);
+ private static final long serialVersionUID = 6492570218190936986L;
- private ImageType imageType;
- TikaImageType(ImageType imageType) {
- this.imageType = imageType;
- }
- public ImageType getImageType() {
- return imageType;
- }
- }
+ /**
+ * Mode for checking document access permissions.
+ */
+ public enum AccessCheckMode {
+ /**
+ * Don't check extraction permissions. Content will always be extracted
+ * regardless of document permissions. This is the default for backwards
+ * compatibility with Tika's legacy behavior (<= v1.7).
+ */
+ DONT_CHECK,
- private static final long serialVersionUID = 6492570218190936986L;
+ /**
+ * Check permissions, but allow extraction for accessibility purposes if
+ * extraction for accessibility is allowed.
+ */
+ ALLOW_EXTRACTION_FOR_ACCESSIBILITY,
+
+ /**
+ * If extraction is blocked, throw an {@link org.apache.tika.exception.AccessPermissionException}
+ * even if the document allows extraction for accessibility.
+ */
+ IGNORE_ACCESSIBILITY_ALLOWANCE
+ }
// True if we let PDFBox "guess" where spaces should go:
private boolean enableAutoSpace = true;
@@ -110,27 +117,13 @@ public class PDFParserConfig implements Serializable {
//content from elsewhere in the document.
private boolean ifXFAExtractOnlyXFA = false;
- private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.AUTO;
-
- // If OCR_Strategy=AUTO, then this controls the algorithm used
- private static final OCRStrategyAuto OCR_STRATEGY_AUTO_BETTER = new
OCRStrategyAuto(10, 10);
- private static final OCRStrategyAuto OCR_STRATEGY_AUTO_FASTER = new
OCRStrategyAuto(.1f, 10);
- private static final int OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE = 10;
-
- private OCRStrategyAuto ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER;
-
- private OCR_RENDERING_STRATEGY ocrRenderingStrategy =
OCR_RENDERING_STRATEGY.ALL;
-
- private int ocrDPI = 300;
- private TikaImageType ocrImageType = TikaImageType.GRAY;
- private String ocrImageFormatName = "png";
- private float ocrImageQuality = 1.0f;
+ private OcrConfig ocr = new OcrConfig();
/**
* Should the entire document be rendered?
*/
private IMAGE_STRATEGY imageStrategy = IMAGE_STRATEGY.NONE;
- private AccessChecker accessChecker = new AccessChecker();
+ private AccessCheckMode accessCheckMode = AccessCheckMode.DONT_CHECK;
//The PDFParser can throw IOExceptions if there is a problem
//with a streams. If this is set to true, Tika's
@@ -483,12 +476,12 @@ public class PDFParserConfig implements Serializable {
this.dropThreshold = dropThreshold;
}
- public AccessChecker getAccessChecker() {
- return accessChecker;
+ public AccessCheckMode getAccessCheckMode() {
+ return accessCheckMode;
}
- public void setAccessChecker(AccessChecker accessChecker) {
- this.accessChecker = accessChecker;
+ public void setAccessCheckMode(AccessCheckMode accessCheckMode) {
+ this.accessCheckMode = accessCheckMode;
}
/**
@@ -514,172 +507,101 @@ public class PDFParserConfig implements Serializable {
}
/**
- * @return strategy to use for OCR
+ * @return the OCR configuration
*/
- public OCR_STRATEGY getOcrStrategy() {
- return ocrStrategy;
+ public OcrConfig getOcr() {
+ return ocr;
}
/**
- * @return ocr auto strategy to use when ocr_strategy = Auto
+ * @param ocr the OCR configuration
*/
- public OCRStrategyAuto getOcrStrategyAuto() {
- return ocrStrategyAuto;
+ public void setOcr(OcrConfig ocr) {
+ this.ocr = ocr;
}
/**
- * Which strategy to use for OCR
- *
- * @param ocrStrategy
+ * @return strategy to use for OCR
*/
- public void setOcrStrategy(OCR_STRATEGY ocrStrategy) {
- this.ocrStrategy = ocrStrategy;
+ public OcrConfig.Strategy getOcrStrategy() {
+ return ocr.getStrategy();
}
-
/**
- * Sets the OCR strategy auto configuration from an object.
- * Used by Jackson deserialization.
- *
- * @param ocrStrategyAuto the OCR strategy auto configuration
+ * @return ocr auto strategy to use when ocr_strategy = Auto
*/
- public void setOcrStrategyAuto(OCRStrategyAuto ocrStrategyAuto) {
- this.ocrStrategyAuto = ocrStrategyAuto;
+ public OcrConfig.StrategyAuto getOcrStrategyAuto() {
+ return ocr.getStrategyAuto();
}
/**
- * Sets the OCR strategy auto configuration from a string.
- * Used for configuration parsing from XML/text via PDFParser's @Field
annotation.
- * Package-private to prevent Jackson from discovering it during bean
introspection.
- *
- * @param ocrStrategyAuto string representation of OCR strategy
- */
- void setOcrStrategyAutoFromString(String ocrStrategyAuto) {
- final String regex =
"^\\s*(faster|better)|(\\d{1,3})(%)?(?:,\\s*(\\d{1,3}))?\\s*$";
- Pattern pattern = Pattern.compile(regex);
- Matcher matcher = pattern.matcher(ocrStrategyAuto);
- if (matcher.matches()) {
- final String group1 = matcher.group(1);
-
- if ("better".equals(group1)) {
- this.ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER;
- } else if ("faster".equals(group1)) {
- this.ocrStrategyAuto = OCR_STRATEGY_AUTO_FASTER;
- } else {
- float unmappedUnicodeCharsPerPage =
Integer.parseInt(matcher.group(2));
- if (matcher.group(3) != null) {
- // If we have the percent sign, then convert
- if (unmappedUnicodeCharsPerPage > 100.0) {
- throw new IllegalArgumentException
- ("Error parsing OCRStrategyAuto - Percent cannot
exceed 100%");
- }
- unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage
/ 100f;
- }
- // The 2nd number is optional. Default to 10 chars per page
- int totalCharsPerPage = matcher.group(4) == null
- ? OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE
- : Integer.parseInt(matcher.group(4));
- this.ocrStrategyAuto = new
OCRStrategyAuto(unmappedUnicodeCharsPerPage, totalCharsPerPage);
- }
+ * Which strategy to use for OCR
+ */
+ public void setOcrStrategy(OcrConfig.Strategy ocrStrategy) {
+ ocr.setStrategy(ocrStrategy);
+ }
- } else {
- throw new IllegalArgumentException("Error parsing OCRStrategyAuto
- Must be in the form 'num[%], num'");
- }
+ /**
+ * Sets the OCR strategy auto configuration.
+ */
+ public void setOcrStrategyAuto(OcrConfig.StrategyAuto ocrStrategyAuto) {
+ ocr.setStrategyAuto(ocrStrategyAuto);
}
- public OCR_RENDERING_STRATEGY getOcrRenderingStrategy() {
- return ocrRenderingStrategy;
+ public OcrConfig.RenderingStrategy getOcrRenderingStrategy() {
+ return ocr.getRenderingStrategy();
}
/**
* When rendering the page for OCR, do you want to include the rendering
of the electronic text,
* ALL, or do you only want to run OCR on the images and vector graphics
(NO_TEXT)?
- *
- * @param ocrRenderingStrategy
*/
- public void setOcrRenderingStrategy(OCR_RENDERING_STRATEGY
ocrRenderingStrategy) {
- this.ocrRenderingStrategy = ocrRenderingStrategy;
+ public void setOcrRenderingStrategy(OcrConfig.RenderingStrategy
ocrRenderingStrategy) {
+ ocr.setRenderingStrategy(ocrRenderingStrategy);
}
- /**
- * String representation of the image format used to render
- * the page image for OCR (examples: png, tiff, jpeg)
- *
- * @return
- */
- public String getOcrImageFormatName() {
- return ocrImageFormatName;
+ public OcrConfig.ImageFormat getOcrImageFormat() {
+ return ocr.getImageFormat();
}
- /**
- * @param ocrImageFormatName name of image format used to render
- * page image
- * @see #getOcrImageFormatName()
- */
- public void setOcrImageFormatName(String ocrImageFormatName) {
- if (!ocrImageFormatName.equals("png") &&
!ocrImageFormatName.equals("tiff") &&
- !ocrImageFormatName.equals("jpeg")) {
- throw new IllegalArgumentException(
- "Available options: png, tiff, jpeg. " + "I'm sorry, but I
don't recognize: " +
- ocrImageFormatName);
- }
- this.ocrImageFormatName = ocrImageFormatName;
+ public void setOcrImageFormat(OcrConfig.ImageFormat ocrImageFormat) {
+ ocr.setImageFormat(ocrImageFormat);
}
- /**
- * Image type used to render the page image for OCR.
- *
- * @return image type
- * @see #setOcrImageType(TikaImageType)
- */
- public TikaImageType getOcrImageType() {
- return ocrImageType;
+ public OcrConfig.ImageType getOcrImageType() {
+ return ocr.getImageType();
}
- /**
- * Image type used to render the page image for OCR.
- *
- * @param ocrImageType
- */
- public void setOcrImageType(TikaImageType ocrImageType) {
- this.ocrImageType = ocrImageType;
+ public void setOcrImageType(OcrConfig.ImageType ocrImageType) {
+ ocr.setImageType(ocrImageType);
}
/**
- * Dots per inch used to render the page image for OCR
- *
- * @return dots per inch
+ * @return dots per inch used to render the page image for OCR
*/
public int getOcrDPI() {
- return ocrDPI;
+ return ocr.getDpi();
}
/**
* Dots per inch used to render the page image for OCR.
- * This does not apply to all image formats.
- *
- * @param ocrDPI
*/
public void setOcrDPI(int ocrDPI) {
- this.ocrDPI = ocrDPI;
+ ocr.setDpi(ocrDPI);
}
/**
- * Image quality used to render the page image for OCR.
- * This does not apply to all image formats
- *
- * @return
+ * @return image quality used to render the page image for OCR
*/
public float getOcrImageQuality() {
- return ocrImageQuality;
+ return ocr.getImageQuality();
}
/**
* Image quality used to render the page image for OCR.
- * This does not apply to all image formats
*/
public void setOcrImageQuality(float ocrImageQuality) {
- this.ocrImageQuality = ocrImageQuality;
+ ocr.setImageQuality(ocrImageQuality);
}
/**
@@ -762,6 +684,21 @@ public class PDFParserConfig implements Serializable {
this.imageGraphicsEngineFactory = imageGraphicsEngineFactory;
}
+ /**
+ * EXPERT: Customize the class that handles inline images within a PDF
page.
+ * Use this setter when specifying the factory class name in JSON config.
+ *
+ * @param className fully qualified class name of an
ImageGraphicsEngineFactory implementation
+ */
+ public void setImageGraphicsEngineFactoryClass(String className) {
+ try {
+ Class<?> clazz = Class.forName(className);
+ this.imageGraphicsEngineFactory = (ImageGraphicsEngineFactory)
clazz.getDeclaredConstructor().newInstance();
+ } catch (Exception e) {
+ throw new RuntimeException("Failed to instantiate
ImageGraphicsEngineFactory: " + className, e);
+ }
+ }
+
public ImageGraphicsEngineFactory getImageGraphicsEngineFactory() {
return imageGraphicsEngineFactory;
}
@@ -808,78 +745,6 @@ public class PDFParserConfig implements Serializable {
return throwOnEncryptedPayload;
}
- public enum OCR_STRATEGY {
- AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION
- }
-
- /**
- * Encapsulate the numbers used to control OCR Strategy when set to auto
- * <p>
- * If the total characters on the page < this.totalCharsPerPage
- * or
- * total unmapped unicode characters on the page >
this.unmappedUnicodeCharsPerPage
- * then we will perform OCR on the page
- * <p>
- * If unamppedUnicodeCharsPerPage is an integer > 0, then we compare
absolute number of characters.
- * If it is a float < 1, then we assume it is a percentage and we compare
it to the
- * percentage of unmappedCharactersPerPage/totalCharsPerPage
- */
- public static class OCRStrategyAuto implements Serializable {
- private float unmappedUnicodeCharsPerPage;
- private int totalCharsPerPage;
-
- /**
- * No-arg constructor for Jackson deserialization.
- * Uses default "better" strategy values.
- */
- public OCRStrategyAuto() {
- this(10, 10);
- }
-
- public OCRStrategyAuto(float unmappedUnicodeCharsPerPage, int
totalCharsPerPage) {
- this.totalCharsPerPage = totalCharsPerPage;
- this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage;
- }
-
- public float getUnmappedUnicodeCharsPerPage() {
- return unmappedUnicodeCharsPerPage;
- }
-
- public void setUnmappedUnicodeCharsPerPage(float
unmappedUnicodeCharsPerPage) {
- this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage;
- }
-
- public int getTotalCharsPerPage() {
- return totalCharsPerPage;
- }
-
- public void setTotalCharsPerPage(int totalCharsPerPage) {
- this.totalCharsPerPage = totalCharsPerPage;
- }
-
- @Override
- public String toString() {
- //TODO -- figure out if this is actual BEST or whatever
- //and return that instead of the literal values
- String unmappedString = null;
- if (unmappedUnicodeCharsPerPage < 1.0) {
- unmappedString = String.format(Locale.US, "%.03f",
- unmappedUnicodeCharsPerPage * 100) + "%";
- } else {
- unmappedString = String.format(Locale.US, "%.0f",
unmappedUnicodeCharsPerPage);
- }
- return unmappedString + "," + totalCharsPerPage;
- }
- }
-
- public enum OCR_RENDERING_STRATEGY {
- NO_TEXT, //includes vector graphics and image
- TEXT_ONLY, //renders only glyphs
- VECTOR_GRAPHICS_ONLY, //renders only vector graphics
- ALL
- //TODO: add AUTO?
- }
-
public enum IMAGE_STRATEGY {
NONE,
/**
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
index f0cdd0811e..890f7beeef 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
@@ -42,23 +42,4 @@ public class ImageGraphicsEngineFactory implements
Serializable {
processedInlineImages, imageCounter, xhtml, parentMetadata,
parseContext);
}
- /**
- * Returns the factory type for serialization purposes.
- * This allows Jackson to serialize the factory object without requiring
additional dependencies.
- *
- * @return the fully qualified class name of this factory
- */
- public String getFactoryType() {
- return getClass().getName();
- }
-
- /**
- * Setter for factory type to complete the JavaBean pattern for Jackson
deserialization.
- * This is a no-op since the factory type is derived from the class itself.
- *
- * @param factoryType the factory type (ignored)
- */
- public void setFactoryType(String factoryType) {
- // No-op: factory type is determined by the class, not set externally
- }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
index c4d3a028f8..5cc42a1611 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
@@ -228,7 +228,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer,
Initializable {
if (pdfParserConfig == null) {
return defaultImageType;
}
- return pdfParserConfig.getOcrImageType().getImageType();
+ return pdfParserConfig.getOcrImageType().getPdfBoxImageType();
}
protected String getImageFormatName(ParseContext parseContext) {
@@ -236,6 +236,6 @@ public class PDFBoxRenderer implements PDDocumentRenderer,
Initializable {
if (pdfParserConfig == null) {
return defaultImageFormatName;
}
- return pdfParserConfig.getOcrImageFormatName();
+ return pdfParserConfig.getOcrImageFormat().getFormatName();
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
deleted file mode 100644
index 2335fc16f3..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.exception.AccessPermissionException;
-import org.apache.tika.metadata.AccessPermissions;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PropertyTypeException;
-
-public class AccessCheckerTest {
-
- @Test
- public void testLegacy() throws AccessPermissionException {
- //test that there are no thrown exceptions
- Metadata m = getMetadata(false, false);
- //legacy behavior; don't bother checking
- AccessChecker checker = new AccessChecker();
- checker.check(m);
-
- m = getMetadata(false, true);
- checker.check(m);
-
- m = getMetadata(true, true);
- checker.check(m);
- }
-
- @Test
- public void testNoExtraction() {
-
- Metadata m = null;
- //allow nothing
- AccessChecker checker = new AccessChecker(false);
- boolean ex = false;
- try {
- m = getMetadata(false, false);
- checker.check(m);
- } catch (AccessPermissionException e) {
- ex = true;
- }
- assertTrue(ex, "correct exception with no extraction, no extract for
accessibility");
- ex = false;
- try {
- //document allows extraction for accessibility
- m = getMetadata(false, true);
- checker.check(m);
- } catch (AccessPermissionException e) {
- //but application is not an accessibility application
- ex = true;
- }
- assertTrue(ex, "correct exception with no extraction, no extract for
accessibility");
- }
-
- @Test
- public void testExtractOnlyForAccessibility() throws
AccessPermissionException {
- Metadata m = getMetadata(false, true);
- //allow accessibility
- AccessChecker checker = new AccessChecker(true);
- checker.check(m);
- assertTrue(true, "no exception");
- boolean ex = false;
- try {
- m = getMetadata(false, false);
- checker.check(m);
- } catch (AccessPermissionException e) {
- ex = true;
- }
- assertTrue(ex, "correct exception");
- }
-
- @Test
- public void testIllogicalExtractNotForAccessibility() throws
AccessPermissionException {
- Metadata m = getMetadata(true, false);
- //allow accessibility
- AccessChecker checker = new AccessChecker(true);
- checker.check(m);
- assertTrue(true, "no exception");
-
- //don't extract for accessibility
- checker = new AccessChecker(false);
- //if extract content is allowed, the checker shouldn't
- //check the value of extract for accessibility
- checker.check(m);
- assertTrue(true, "no exception");
-
- }
-
- @Test
- public void testCantAddMultiplesToMetadata() {
- Metadata m = new Metadata();
- boolean ex = false;
- m.add(AccessPermissions.EXTRACT_CONTENT, "true");
- try {
- m.add(AccessPermissions.EXTRACT_CONTENT, "false");
- } catch (PropertyTypeException e) {
- ex = true;
- }
- assertTrue(ex, "can't add multiple values");
-
- m = new Metadata();
- ex = false;
- m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true");
- try {
- m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false");
- } catch (PropertyTypeException e) {
- ex = true;
- }
- assertTrue(ex, "can't add multiple values");
- }
-
- private Metadata getMetadata(boolean allowExtraction, boolean
allowExtractionForAccessibility) {
- Metadata m = new Metadata();
- m.set(AccessPermissions.EXTRACT_CONTENT,
Boolean.toString(allowExtraction));
- m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
- Boolean.toString(allowExtractionForAccessibility));
- return m;
- }
-}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java
index e1bf5d5000..aef622b400 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.pdf;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
+import com.fasterxml.jackson.annotation.JsonTypeInfo;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDPage;
@@ -29,8 +30,52 @@ import org.apache.tika.parser.pdf.image.ImageGraphicsEngine;
import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
import org.apache.tika.sax.XHTMLContentHandler;
+/**
+ * Example custom ImageGraphicsEngineFactory demonstrating how users can create
+ * their own factory implementations with custom configuration parameters.
+ * <p>
+ * <b>JSON Config File Usage:</b> Use the class name string approach:
+ * <pre>
+ * {
+ * "pdf-parser": {
+ * "imageGraphicsEngineFactoryClass": "com.example.MyCustomFactory"
+ * }
+ * }
+ * </pre>
+ * Note: This approach does not support custom parameters; the factory will
use default values.
+ * <p>
+ * <b>ParseContext Serialization:</b> The {@code @JsonTypeInfo} annotation
enables polymorphic
+ * serialization when using tika-serialization's polymorphic ObjectMapper
(e.g., for
+ * ParseContext round-trip serialization). This requires the annotation on
both the base
+ * class and subclass for full polymorphic support.
+ */
+@JsonTypeInfo(use = JsonTypeInfo.Id.CLASS, property = "@class")
public class MyCustomImageGraphicsEngineFactory extends
ImageGraphicsEngineFactory {
+ /**
+ * Metadata key used to record that this custom factory was used during
parsing.
+ */
+ public static final String CUSTOM_FACTORY_USED =
"X-CustomGraphicsEngineFactory-Used";
+
+ /**
+ * Metadata key used to record the customParam value.
+ */
+ public static final String CUSTOM_PARAM_KEY =
"X-CustomGraphicsEngineFactory-CustomParam";
+
+ private String customParam = "default";
+
+ public MyCustomImageGraphicsEngineFactory() {
+ // Default constructor required for Jackson deserialization
+ }
+
+ public String getCustomParam() {
+ return customParam;
+ }
+
+ public void setCustomParam(String customParam) {
+ this.customParam = customParam;
+ }
+
@Override
public ImageGraphicsEngine newEngine(PDPage page,
int pageNumber,
@@ -39,6 +84,12 @@ public class MyCustomImageGraphicsEngineFactory extends
ImageGraphicsEngineFacto
Map<COSStream, Integer>
processedInlineImages,
AtomicInteger imageCounter,
XHTMLContentHandler xhtml,
Metadata parentMetadata, ParseContext
parseContext) {
- throw new RuntimeException("testing123");
+ // Record that this custom factory was used
+ parentMetadata.set(CUSTOM_FACTORY_USED, "true");
+ parentMetadata.set(CUSTOM_PARAM_KEY, customParam);
+
+ // Delegate to the default implementation
+ return super.newEngine(page, pageNumber, embeddedDocumentExtractor,
pdfParserConfig,
+ processedInlineImages, imageCounter, xhtml, parentMetadata,
parseContext);
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index b4645386db..35ac2dbf00 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -19,13 +19,12 @@ package org.apache.tika.parser.pdf;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
import java.io.InputStream;
+import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -46,10 +45,9 @@ import org.xml.sax.ContentHandler;
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Font;
@@ -70,7 +68,6 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
-import org.apache.tika.utils.ExceptionUtils;
/**
* Test case for parsing pdf files.
@@ -883,7 +880,7 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
//don't allow extraction, not even for accessibility
- config.setAccessChecker(new AccessChecker(false));
+
config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.IGNORE_ACCESSIBILITY_ALLOWANCE);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
@@ -894,7 +891,7 @@ public class PDFParserTest extends TikaTest {
AccessPermissionException.class);
}
- config.setAccessChecker(new AccessChecker(true));
+
config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY);
assertException("/test-documents/" +
"testPDF_no_extract_no_accessibility_owner_empty.pdf",
AUTO_DETECT_PARSER, context, AccessPermissionException.class);
@@ -908,7 +905,7 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
//don't allow extraction, not even for accessibility
- config.setAccessChecker(new AccessChecker(false));
+
config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.IGNORE_ACCESSIBILITY_ALLOWANCE);
PasswordProvider passwordProvider = new PasswordProvider() {
@Override
public String getPassword(Metadata metadata) {
@@ -927,7 +924,7 @@ public class PDFParserTest extends TikaTest {
}
//bad password is still a bad password
- config.setAccessChecker(new AccessChecker(true));
+
config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY);
for (String path : new
String[]{"testPDF_no_extract_no_accessibility_owner_empty.pdf",
"testPDF_no_extract_yes_accessibility_owner_empty.pdf",}) {
assertException("/test-documents/" + path, AUTO_DETECT_PARSER,
context,
@@ -941,7 +938,7 @@ public class PDFParserTest extends TikaTest {
assertContains("Hello World",
getXML("testPDF_no_extract_yes_accessibility_owner_user.pdf",
context).xml);
- config.setAccessChecker(new AccessChecker(false));
+
config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.IGNORE_ACCESSIBILITY_ALLOWANCE);
for (String path : new
String[]{"testPDF_no_extract_no_accessibility_owner_user.pdf",
"testPDF_no_extract_yes_accessibility_owner_user.pdf",}) {
assertException("/test-documents/" + path, AUTO_DETECT_PARSER,
context,
@@ -955,7 +952,7 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
//allow extraction for accessibility
- config.setAccessChecker(new AccessChecker(true));
+
config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY);
PasswordProvider passwordProvider = new PasswordProvider() {
@Override
public String getPassword(Metadata metadata) {
@@ -977,7 +974,7 @@ public class PDFParserTest extends TikaTest {
}
//really, with owner's password, all extraction is allowed
- config.setAccessChecker(new AccessChecker(false));
+
config.setAccessCheckMode(PDFParserConfig.AccessCheckMode.IGNORE_ACCESSIBILITY_ALLOWANCE);
for (String path : new
String[]{"testPDF_no_extract_no_accessibility_owner_user.pdf",
"testPDF_no_extract_yes_accessibility_owner_user.pdf",
"testPDF_no_extract_no_accessibility_owner_empty.pdf",
@@ -1101,45 +1098,43 @@ public class PDFParserTest extends TikaTest {
@Test
public void testInitializationViaConfig() throws Exception {
- try (InputStream is = getResourceAsStream(
- "/org/apache/tika/parser/pdf/tika-config.xml")) {
- assertNotNull(is);
- TikaConfig tikaConfig = new TikaConfig(is);
- Parser p = new AutoDetectParser(tikaConfig);
-
- String text =
-
getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p);
- text = text.replaceAll("\\s+", " ");
-
- // Column text is now interleaved:
- assertContains(
- "Left column line 1 Right column line 1 " +
- "Left colu mn line 2 Right column line 2",
- text);
-
- //test overriding underlying settings with PDFParserConfig
- ParseContext pc = new ParseContext();
- PDFParserConfig config = new PDFParserConfig();
- config.setSortByPosition(false);
- pc.set(PDFParserConfig.class, config);
- text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc);
- text = text.replaceAll("\\s+", " ");
- // Column text is not interleaved:
- assertContains("Left column line 1 Left column line 2 ", text);
-
- //test a new PDFParserConfig and setting another value
- //this tests that a new PDFParserConfig completely resets
- //behavior
- config = new PDFParserConfig();
- config.setOcrDPI(10000);
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
- pc.set(PDFParserConfig.class, config);
- text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc);
- text = text.replaceAll("\\s+", " ");
-
- // Column text is not interleaved:
- assertContains("Left column line 1 Left column line 2 ", text);
- }
+ Path configPath = Path.of(getClass().getResource(
+ "/org/apache/tika/parser/pdf/tika-config.json").toURI());
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser p = loader.loadAutoDetectParser();
+
+ String text =
+
getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p);
+ text = text.replaceAll("\\s+", " ");
+
+ // Column text is now interleaved:
+ assertContains(
+ "Left column line 1 Right column line 1 " +
+ "Left colu mn line 2 Right column line 2",
+ text);
+
+ //test overriding underlying settings with PDFParserConfig
+ ParseContext pc = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setSortByPosition(false);
+ pc.set(PDFParserConfig.class, config);
+ text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc);
+ text = text.replaceAll("\\s+", " ");
+ // Column text is not interleaved:
+ assertContains("Left column line 1 Left column line 2 ", text);
+
+ //test a new PDFParserConfig and setting another value
+ //this tests that a new PDFParserConfig completely resets
+ //behavior
+ config = new PDFParserConfig();
+ config.setOcrDPI(10000);
+ config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
+ pc.set(PDFParserConfig.class, config);
+ text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc);
+ text = text.replaceAll("\\s+", " ");
+
+ // Column text is not interleaved:
+ assertContains("Left column line 1 Left column line 2 ", text);
}
// Moved to tika-parsers-standard-package
PDFParserTest.testInitializationOfNonPrimitivesViaJsonConfig
@@ -1162,30 +1157,28 @@ public class PDFParserTest extends TikaTest {
@Test
public void testConfiguringMoreParams() throws Exception {
- try (InputStream configIs = getResourceAsStream(
- "/org/apache/tika/parser/pdf/tika-inline-config.xml")) {
- assertNotNull(configIs);
- TikaConfig tikaConfig = new TikaConfig(configIs);
- AutoDetectParser p = new AutoDetectParser(tikaConfig);
- //make absolutely certain the functionality works!
- List<Metadata> metadata = getRecursiveMetadata("testOCR.pdf", p);
- assertEquals(2, metadata.size());
- Map<MediaType, Parser> parsers = p.getParsers();
- Parser composite = parsers.get(MediaType.application("pdf"));
- Parser pdfParser =
- ((CompositeParser)
composite).getParsers().get(MediaType.application("pdf"));
- assertTrue(pdfParser instanceof PDFParser);
- PDFParserConfig pdfParserConfig = ((PDFParser)
pdfParser).getPDFParserConfig();
- assertEquals(new AccessChecker(true),
pdfParserConfig.getAccessChecker());
- assertEquals(true, pdfParserConfig.isExtractInlineImages());
- assertEquals(false,
pdfParserConfig.isExtractUniqueInlineImagesOnly());
- assertEquals(314, pdfParserConfig.getOcrDPI());
- assertEquals(2.1f, pdfParserConfig.getOcrImageQuality(), .01f);
- assertEquals("jpeg", pdfParserConfig.getOcrImageFormatName());
- assertEquals(524288000, pdfParserConfig.getMaxMainMemoryBytes());
- assertEquals(false,
pdfParserConfig.isCatchIntermediateIOExceptions());
-
- }
+ Path configPath = Path.of(getClass().getResource(
+
"/org/apache/tika/parser/pdf/tika-inline-config.json").toURI());
+ TikaLoader loader = TikaLoader.load(configPath);
+ AutoDetectParser p = (AutoDetectParser) loader.loadAutoDetectParser();
+ //make absolutely certain the functionality works!
+ List<Metadata> metadata = getRecursiveMetadata("testOCR.pdf", p);
+ assertEquals(2, metadata.size());
+ Map<MediaType, Parser> parsers = p.getParsers();
+ Parser composite = parsers.get(MediaType.application("pdf"));
+ Parser pdfParser =
+ ((CompositeParser)
composite).getParsers().get(MediaType.application("pdf"));
+ assertTrue(pdfParser instanceof PDFParser);
+ PDFParserConfig pdfParserConfig = ((PDFParser)
pdfParser).getPDFParserConfig();
+
assertEquals(PDFParserConfig.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY,
+ pdfParserConfig.getAccessCheckMode());
+ assertEquals(true, pdfParserConfig.isExtractInlineImages());
+ assertEquals(false, pdfParserConfig.isExtractUniqueInlineImagesOnly());
+ assertEquals(314, pdfParserConfig.getOcrDPI());
+ assertEquals(2.1f, pdfParserConfig.getOcrImageQuality(), .01f);
+ assertEquals(OcrConfig.ImageFormat.JPEG,
pdfParserConfig.getOcrImageFormat());
+ assertEquals(524288000, pdfParserConfig.getMaxMainMemoryBytes());
+ assertEquals(false, pdfParserConfig.isCatchIntermediateIOExceptions());
}
//TODO: figure out how to test jp2 embedded with OCR
@@ -1395,20 +1388,22 @@ public class PDFParserTest extends TikaTest {
@Test
public void testCustomGraphicsEngineFactory() throws Exception {
- try (InputStream is =
- getResourceAsStream(
- "tika-config-custom-graphics-engine.xml")) {
- assertNotNull(is);
- TikaConfig tikaConfig = new TikaConfig(is);
- Parser p = new AutoDetectParser(tikaConfig);
- try {
- List<Metadata> metadataList =
getRecursiveMetadata("testPDF_JBIG2.pdf", p);
- fail("should have thrown a runtime exception");
- } catch (TikaException e) {
- String stack = ExceptionUtils.getStackTrace(e);
- assertContains("testing123", stack);
- }
- }
+ Path configPath = Path.of(getClass().getResource(
+ "tika-config-custom-graphics-engine.json").toURI());
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser p = loader.loadAutoDetectParser();
+
+ // Parse a PDF with inline images to trigger the custom graphics engine factory
+ List<Metadata> metadataList =
getRecursiveMetadata("testPDF_JBIG2.pdf", p);
+
+ // Verify the custom factory was used
+ // Note: customParam uses default value since JSON config uses class name string
+ // (polymorphic config with params requires @JsonTypeInfo on base class)
+ Metadata metadata = metadataList.get(0);
+ assertEquals("true",
metadata.get(MyCustomImageGraphicsEngineFactory.CUSTOM_FACTORY_USED),
+ "Custom graphics engine factory should have been used");
+ assertEquals("default",
metadata.get(MyCustomImageGraphicsEngineFactory.CUSTOM_PARAM_KEY),
+ "customParam should have default value when using class name string config");
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.json
new file mode 100644
index 0000000000..2411ca211e
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.json
@@ -0,0 +1,11 @@
+{
+ "parsers": [
+ {
+ "pdf-parser": {
+ "sortByPosition": true,
+ "extractInlineImages": true,
+ "imageGraphicsEngineFactoryClass":
"org.apache.tika.parser.pdf.MyCustomImageGraphicsEngineFactory"
+ }
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml
deleted file mode 100644
index 5aa259feeb..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="sortByPosition" type="bool">true</param>
- <param name="extractInlineImages" type="bool">true</param>
- <param name="imageGraphicsEngineFactory"
class="org.apache.tika.parser.pdf.MyCustomImageGraphicsEngineFactory"/>
- </params>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
deleted file mode 100644
index 3cc9d8b237..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="sortByPosition" type="bool">true</param>
- <param name="ocrImageType" type="string">rgb</param>
- <param name="ocrStrategy" type="string">ocr_only</param>
-
- </params>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.json
new file mode 100644
index 0000000000..a16c7e4d96
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.json
@@ -0,0 +1,9 @@
+{
+ "parsers": [
+ {
+ "pdf-parser": {
+ "sortByPosition": true
+ }
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
deleted file mode 100644
index 98940da24b..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="sortByPosition" type="bool">true</param>
- </params>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json
new file mode 100644
index 0000000000..deaea70b79
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json
@@ -0,0 +1,19 @@
+{
+ "parsers": [
+ {
+ "default-parser": {}
+ },
+ {
+ "pdf-parser": {
+ "extractInlineImages": true,
+ "accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY",
+ "catchIntermediateIOExceptions": false,
+ "extractUniqueInlineImagesOnly": false,
+ "ocrDPI": 314,
+ "ocrImageQuality": 2.1,
+ "ocrImageFormat": "JPEG",
+ "maxMainMemoryBytes": 524288000
+ }
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
deleted file mode 100644
index 9124c89244..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser"/>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractInlineImages" type="bool">true</param>
- <param name="allowExtractionForAccessibility"
type="bool">true</param>
- <param name="catchIntermediateExceptions"
type="bool">false</param>
- <param name="extractUniqueInlineImagesOnly"
type="bool">false</param>
- <param name="catchIntermediateExceptions"
type="bool">false</param>
- <param name="ocrDPI" type="int">314</param>
- <param name="ocrImageQuality" type="float">2.1</param>
- <param name="ocrImageFormatName" type="string">jpeg</param>
- <param name="ocrImageScale" type="float">1.3</param>
- <param name="maxMainMemoryBytes" type="long">524288000</param>
- <!-- we really should throw an exception for this!! -->
- <param name="someRandomThingOrOther" type="bool">true</param>
- </params>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml
deleted file mode 100644
index e187601190..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude
class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- </parser>
- <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
- <params>
- <param name="maxFileSizeToOcr" type="long">100</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractInlineImages" type="bool">false</param>
- <param name="ocrStrategy" type="string">ocr_only</param>
- </params>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
deleted file mode 100644
index 92f351bb9a..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="imageStrategy"
type="string">renderPagesBeforeParse</param>
- </params>
- </parser>
- </parsers>
-<!--
- This will be supplied automatically if not specified.
- <renderers>
- <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
- </renderers> -->
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
deleted file mode 100644
index e3f92df6c9..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="imageStrategy"
type="string">renderPagesAtPageEnd</param>
- </params>
- </parser>
- </parsers>
-<!-- <renderers>
- <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
- </renderers> -->
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
deleted file mode 100644
index 20adbf2880..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-xml-profiler-config.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- </parser>
- <parser class="org.apache.tika.parser.xml.XMLProfiler"/>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
index 9cb5df8d52..1850664864 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -38,8 +38,8 @@ public class TikaConfigSerializerTest {
assertContains(detectorNeedle, xml);
String parserNeedle = "<parser
class=\"org.apache.tika.parser.pdf.PDFParser\">" +
- " <params> <param name=\"allowExtractionForAccessibility\" " +
- "type=\"bool\">true</param>";
+ " <params> <param name=\"accessCheckMode\" " +
+ "type=\"string\">DONT_CHECK</param>";
assertContains(parserNeedle, xml);
//TODO This is still to be implemented -- we do not want to show the default renderer here
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
index 18c1314599..140a82d5f0 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
@@ -27,6 +27,7 @@ import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.OcrConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
public class TSDParserTest extends TikaTest {
@@ -35,7 +36,7 @@ public class TSDParserTest extends TikaTest {
public void testBrokenPdf() throws Exception {
ParseContext parseContext = new ParseContext();
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
parseContext.set(PDFParserConfig.class, config);
//make sure that embedded file appears in list
//and make sure embedded exception is recorded
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index d0aa70519e..d89beadcfd 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -113,7 +113,7 @@ public class PDFParserTest extends TikaTest {
private static ParseContext NO_OCR() {
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
return context;
@@ -230,7 +230,7 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
config.setExtractInlineImages(true);
config.setExtractUniqueInlineImagesOnly(false);
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
context.set(org.apache.tika.parser.Parser.class, p);
@@ -260,15 +260,15 @@ public class PDFParserTest extends TikaTest {
public void testEmbeddedDocsWithOCROnly() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
//test default is "auto"
- assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, new
PDFParserConfig().getOcrStrategy());
+ assertEquals(OcrConfig.Strategy.AUTO, new
PDFParserConfig().getOcrStrategy());
testStrategy(null);
//now test other options
- for (PDFParserConfig.OCR_STRATEGY strategy :
PDFParserConfig.OCR_STRATEGY.values()) {
+ for (OcrConfig.Strategy strategy : OcrConfig.Strategy.values()) {
testStrategy(strategy);
}
}
- private void testStrategy(PDFParserConfig.OCR_STRATEGY strategy) throws
Exception {
+ private void testStrategy(OcrConfig.Strategy strategy) throws Exception {
//make sure everything works with regular xml _and_ with recursive
ParseContext context = new ParseContext();
if (strategy != null) {
@@ -277,7 +277,7 @@ public class PDFParserTest extends TikaTest {
context.set(PDFParserConfig.class, config);
};
PDFParserConfig config = context.get(PDFParserConfig.class, new
PDFParserConfig());
- config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL);
+ config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.ALL);
context.set(PDFParserConfig.class, config);
XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx",
context);
@@ -289,7 +289,7 @@ public class PDFParserTest extends TikaTest {
}
assertContains("Haystack", xmlResult.xml);
assertContains("Needle", xmlResult.xml);
- if (strategy == null || strategy !=
PDFParserConfig.OCR_STRATEGY.NO_OCR) {
+ if (strategy == null || strategy != OcrConfig.Strategy.NO_OCR) {
// Tesseract may see the t in haystack as a ! some times...
//or it might see dehayslack...
//TODO: figure out how to make this test less hacky
@@ -328,7 +328,7 @@ public class PDFParserTest extends TikaTest {
//TIKA-1990, test that an embedded jpeg is correctly decoded
PDFParserConfig config = new PDFParserConfig();
config.setExtractInlineImages(true);
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
@@ -349,7 +349,7 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
config.setExtractInlineImages(true);
config.setExtractUniqueInlineImagesOnly(false);
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
context.set(PDFParserConfig.class, config);
@@ -376,7 +376,7 @@ public class PDFParserTest extends TikaTest {
public void testJBIG2OCROnly() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
//make sure everything works with regular xml _and_ with recursive
@@ -388,7 +388,7 @@ public class PDFParserTest extends TikaTest {
public void testJPEG2000() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
//make sure everything works with regular xml _and_ with recursive
@@ -404,13 +404,13 @@ public class PDFParserTest extends TikaTest {
assertContains("Happy New Year", getXML("testOCR.pdf").xml);
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO);
+ config.setOcrStrategy(OcrConfig.Strategy.AUTO);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
XMLResult xmlResult = getXML("testOCR.pdf", context);
assertContains("Happy New Year", xmlResult.xml);
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
String txt = getText("testOCR.pdf", new Metadata(), context);
assertEquals("", txt.trim());
}
@@ -419,16 +419,16 @@ public class PDFParserTest extends TikaTest {
public void testOCRNoText() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
- config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL);
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.ALL);
+ config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY);
ParseContext parseContext = new ParseContext();
parseContext.set(PDFParserConfig.class, config);
XMLResult xmlResult = getXML("testPDF_XFA_govdocs1_258578.pdf",
parseContext);
assertContains("PARK", xmlResult.xml);
assertContains("Applications", xmlResult.xml);
- config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT);
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.NO_TEXT);
+ config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY);
parseContext.set(PDFParserConfig.class, config);
xmlResult = getXML("testPDF_XFA_govdocs1_258578.pdf", parseContext);
assertContains("NATIONAL", xmlResult.xml);
@@ -583,7 +583,7 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
config.setSortByPosition(true);
config.setExtractInlineImages(true);
- config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO);
+ config.setOcrStrategy(OcrConfig.Strategy.AUTO);
ParseContext parseContext = new ParseContext();
parseContext.set(PDFParserConfig.class, config);
@@ -607,7 +607,7 @@ public class PDFParserTest extends TikaTest {
"sortByPosition should be preserved");
assertTrue(deserializedConfig.isExtractInlineImages(),
"extractInlineImages should be preserved");
- assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO,
deserializedConfig.getOcrStrategy(),
+ assertEquals(OcrConfig.Strategy.AUTO,
deserializedConfig.getOcrStrategy(),
"ocrStrategy should be preserved");
}
@@ -661,9 +661,9 @@ public class PDFParserTest extends TikaTest {
.get(MediaType.application("pdf"));
assertEquals("org.apache.tika.parser.pdf.PDFParser",
pdfParser.getClass().getName());
- assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY,
+ assertEquals(OcrConfig.Strategy.OCR_ONLY,
((PDFParser) pdfParser).getPDFParserConfig().getOcrStrategy());
- assertEquals(PDFParserConfig.TikaImageType.RGB,
+ assertEquals(OcrConfig.ImageType.RGB,
((PDFParser)
pdfParser).getPDFParserConfig().getOcrImageType());
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
index 011f149cc5..518bffd160 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
@@ -104,7 +104,7 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
} catch (ClassNotFoundException e) {
LOG.debug("Class not found for key '{}', storing in ConfigContainer", fieldName);
} catch (Exception e) {
- LOG.warn("Failed to deserialize '{}' directly, storing in ConfigContainer", fieldName, e);
+ throw new IOException("Failed to deserialize '" +
fieldName + "': " + e.getMessage(), e);
}
}
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
index e961d1ccc7..05750c9812 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
@@ -83,7 +83,7 @@ public class UnpackerResourceWithConfigTest extends
CXFTestBase {
"pdf-parser": {
"imageStrategy": "RENDER_PAGES_AT_PAGE_END",
"ocrImageType": "RGB",
- "ocrImageFormatName": "tiff"
+ "ocrImageFormat": "TIFF"
}
}
""";
@@ -146,7 +146,7 @@ public class UnpackerResourceWithConfigTest extends
CXFTestBase {
"pdf-parser": {
"imageStrategy": "RENDER_PAGES_AT_PAGE_END",
"ocrImageType": "GRAY",
- "ocrImageFormatName": "jpeg"
+ "ocrImageFormat": "JPEG"
}
}
""";