This is an automated email from the ASF dual-hosted git repository.

rombert pushed a commit to branch issue/OAK-12046
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git

commit 2826884b50e3065a6d94d4b1e8b274c2e7e46849
Author: Robert Munteanu <[email protected]>
AuthorDate: Mon Dec 22 14:12:21 2025 +0100

    OAK-12046 - Update default Tika config
---
 .../jackrabbit/oak/plugins/index/lucene/tika-config.xml | 11 ++++++++++-
 .../oak/plugins/index/search/spi/editor/tika-config.xml | 11 ++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml b/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
index 54f229d699..07e002c479 100644
--- a/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
+++ b/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
@@ -24,7 +24,16 @@
     <detector class="org.apache.tika.detect.TypeDetector"/>
   </detectors>
   <parsers>
-    <parser class="org.apache.tika.parser.DefaultParser"/>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <!-- the PDF parser is configured below -->
+      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <!-- Disable XFA/AcroForm extraction -->
+        <param name="extractAcroFormContent" type="bool">false</param>
+      </params>
+    </parser>
     <parser class="org.apache.tika.parser.EmptyParser">
       <!-- Disable package extraction as it's too resource-intensive -->
       <mime>application/x-archive</mime>
diff --git a/oak-search/src/main/resources/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/tika-config.xml b/oak-search/src/main/resources/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/tika-config.xml
index 54f229d699..07e002c479 100644
--- a/oak-search/src/main/resources/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/tika-config.xml
+++ b/oak-search/src/main/resources/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/tika-config.xml
@@ -24,7 +24,16 @@
     <detector class="org.apache.tika.detect.TypeDetector"/>
   </detectors>
   <parsers>
-    <parser class="org.apache.tika.parser.DefaultParser"/>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <!-- the PDF parser is configured below -->
+      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <!-- Disable XFA/AcroForm extraction -->
+        <param name="extractAcroFormContent" type="bool">false</param>
+      </params>
+    </parser>
     <parser class="org.apache.tika.parser.EmptyParser">
       <!-- Disable package extraction as it's too resource-intensive -->
       <mime>application/x-archive</mime>
