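Once any parser is listed explicitly under <parsers>, Tika loads only the parsers you list, so DefaultParser has to be added back (with Tesseract excluded from it) alongside your custom Tesseract entry. Try something like this: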
<properties>
<parsers>
<parser class="org.apache.tika.parser.DefaultParser">
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
</parser>
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
<params>
<param name="timeoutSeconds" type="int">180</param>
</params>
</parser>
</parsers>
<!-- ... keep your existing <server> section here, unchanged ... -->
</properties>
On Tue, Jul 26, 2022 at 6:52 AM PGNet Dev <[email protected]> wrote:
> removing dovecot from the equation, reduced this to just tika,
> reproducible here
>
> running
>
> ls -al /srv/tika/tika-server.jar
> lrwxrwxrwx 1 root root 50 Jul 26 05:42
> /srv/tika/tika-server.jar ->
> tika-server-standard-2.4.2-20220725.215245-121.jar
>
> systemctl status tika -ln0
> ● tika.service - Apache Tika server
> Loaded: loaded (/etc/systemd/system/tika.service;
> enabled; vendor preset: disabled)
> Active: active (running) since Tue 2022-07-26
> 05:43:01 EDT; 29min ago
> Main PID: 10829 (java)
> Tasks: 53 (limit: 8812)
> Memory: 215.9M
> CPU: 37.667s
> CGroup: /system.slice/tika.service
> ├─ 10829 /usr/bin/java
> -Dpdfbox.fontcache=/var/tika -XX:ParallelGCThreads=1 -XX:CICompilerCount=2
> -XX:-CICompilerCountPerCPU -jar /srv/tika/tika-server.jar -c
> /etc/tika/tika-server-config-custom.xml --host 127.0.0.1 --port 9998
> └─ 10863 /usr/bin/java -Xms1g -Xmx1g
> -Dpdfbox.fontcache=/var/tika -Dlog4j2.debug -Djava.awt.headless=true -cp
> /srv/tika/tika-server.jar -Dtika.server.id=
> org.apache.tika.server.core.TikaServerProcess -h 127.0.0.1 -p 9998 -i "" -c
> /etc/tika/tika-server-config-custom.xml -forkedStatusFile
> /tmp/apache-tika-server-forked-tmp-12945021525641519393 -numRestarts 0
>
> on
>
> lsb_release -rd
> Description: Fedora release 36 (Thirty Six)
> Release: 36
>
> with
>
> tesseract --version
> tesseract 5.0.1
> leptonica-1.82.0
> libgif 5.2.1 : libjpeg 6b (libjpeg-turbo 2.1.2) : libpng
> 1.6.37 : libtiff 4.4.0 : zlib 1.2.11 : libwebp 1.2.3
> Found OpenMP 201511
>
> stream --version
> Version: ImageMagick 7.1.0-44 Q16-HDRI x86_64 20294
> https://imagemagick.org
> Copyright: (C) 1999 ImageMagick Studio LLC
> License: https://imagemagick.org/script/license.php
> Features: Cipher DPC HDRI Modules OpenMP(4.5)
> Delegates (built-in): bzlib cairo djvu fontconfig freetype
> gslib gvc heic jbig jng jp2 jpeg lcms lqr ltdl lzma openexr pangocairo png
> ps raqm raw rsvg tiff webp wmf x xml zip zlib
> Compiler: gcc (12.1)
>
> java -version
> Picked up JAVA_TOOL_OPTIONS: -Xmx512M
> openjdk version "18.0.1.1" 2022-04-22
> OpenJDK Runtime Environment 22.3 (build 18.0.1.1+2)
> OpenJDK 64-Bit Server VM 22.3 (build 18.0.1.1+2, mixed
> mode, sharing)
>
> & custom config
>
> cat /etc/tika/tika-server-config-custom.xml
> <?xml version="1.0" encoding="UTF-8"?>
> <properties>
> <parsers>
> </parsers>
> <server>
> <params>
> <logLevel>debug</logLevel>
> <javaPath>/usr/bin/java</javaPath>
> <noFork>false</noFork>
> <forkedJvmArgs>
> <arg>-Xms1g</arg>
> <arg>-Xmx1g</arg>
> <arg>-Dpdfbox.fontcache=/var/tika</arg>
> <arg>-Dlog4j2.debug</arg>
> </forkedJvmArgs>
> <digest>sha256</digest>
>
> <enableUnsecureFeatures>false</enableUnsecureFeatures>
> <id></id>
> <maxFiles>100000</maxFiles>
>
> <maxForkedStartupMillis>120000</maxForkedStartupMillis>
> <maxRestarts>-1</maxRestarts>
> <minimumTimeoutMillis>30000</minimumTimeoutMillis>
> <returnStackTrace>false</returnStackTrace>
> <taskPulseMillis>10000</taskPulseMillis>
> <taskTimeoutMillis>300000</taskTimeoutMillis>
> <endpoints>
> <endpoint>tika</endpoint>
> <endpoint>status</endpoint>
> <endpoint>rmeta</endpoint>
> </endpoints>
> </params>
> </server>
> </properties>
>
> on exec, passing a test pdf,
>
> curl -T ~/Get_Started_With_Smallpdf.pdf http://127.0.0.1:9998/meta
>
> complete metadata's returned
>
> <x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP
> Core Test.SNAPSHOT">
> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
> <rdf:Description rdf:about=""
> xmlns:pdf="http://ns.adobe.com/pdf/1.3/"
> xmlns:xmp="http://ns.adobe.com/xap/1.0/"
> xmlns:dc="http://purl.org/dc/elements/1.1/"
> xmlns:xmpMM="http://ns.adobe.com/xap/1.0/mm/"
> xmlns:xmpTPg="http://ns.adobe.com/xap/1.0/t/pg/"
> pdf:PDFVersion="1.7"
> pdf:hasXFA="false"
> pdf:hasCollection="false"
> pdf:encrypted="false"
> pdf:hasMarkedContent="false"
> pdf:producer="Adobe PDF Library 15.0"
> pdf:hasXMP="true"
> xmp:CreatorTool="Adobe InDesign 15.1 (Macintosh)"
> xmp:CreateDate="2020-10-14T17:08:10Z"
> xmp:ModifyDate="2020-10-14T17:08:10Z"
> xmp:MetadataDate="2020-10-14T17:08:10Z"
> dc:format="application/pdf; version=1.7"
> dc:language="en-US"
> xmpMM:DocumentID="xmp.id:7a865d84-8dbf-4015-96b7-fdae89a9603b"
> xmpTPg:NPages="1">
> <pdf:unmappedUnicodeCharsPerPage>
> <rdf:Seq>
> <rdf:li>0</rdf:li>
> </rdf:Seq>
> </pdf:unmappedUnicodeCharsPerPage>
> <pdf:charsPerPage>
> <rdf:Seq>
> <rdf:li>794</rdf:li>
> </rdf:Seq>
> </pdf:charsPerPage>
> <pdf:annotationTypes>
> <rdf:Bag>
>
> <rdf:li>95e8dd6e9b4c5a3d-3d44cd989a3a348c</rdf:li>
>
> <rdf:li>95e8dd6f9b4c5a3e-3d44cd979a3a348b</rdf:li>
>
> <rdf:li>95e8dd709b4c5a3f-3d44cd969a3a348a</rdf:li>
>
> <rdf:li>95e8dd719b4c5a40-3d44cd959a3a3489</rdf:li>
>
> <rdf:li>95e8dd729b4c5a41-3d44cd949a3a3488</rdf:li>
> </rdf:Bag>
> </pdf:annotationTypes>
> <pdf:annotationSubtypes>
> <rdf:Bag>
> <rdf:li>Link</rdf:li>
> </rdf:Bag>
> </pdf:annotationSubtypes>
> </rdf:Description>
> </rdf:RDF>
> </x:xmpmeta>
>
> if i add TesseractOCRParser class config to the above, for simple param
> override
>
> cat /etc/tika/tika-server-config-custom.xml
> <?xml version="1.0" encoding="UTF-8"?>
> <properties>
> <parsers>
> + <parser
> class="org.apache.tika.parser.ocr.TesseractOCRParser">
> + <params>
> + <param name="timeoutSeconds"
> type="int">180</param>
> + </params>
> + </parser>
> </parsers>
> ...
>
> exec
>
> systemctl restart tika
> curl -T ~/Get_Started_With_Smallpdf.pdf http://127.0.0.1:9998/meta
>
> returns incomplete/truncated data
>
> <x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP
> Core Test.SNAPSHOT">
> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
> <rdf:Description rdf:about=""/>
> </rdf:RDF>
> </x:xmpmeta>
>