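Try something like the config below. Once any parser is listed explicitly under <parsers>, Tika loads only the parsers you list, so DefaultParser has to be added back (with Tesseract excluded from it so that it is not configured twice):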

<properties>
  <parsers>
    <parser class="org.apache.tika.parser.DefaultParser">
      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
    </parser>
    <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
      <params>
        <param name="timeoutSeconds" type="int">180</param>
      </params>
    </parser>
  </parsers>

  <!-- your existing <server>...</server> section goes here, unchanged -->

</properties>


On Tue, Jul 26, 2022 at 6:52 AM PGNet Dev <[email protected]> wrote:

> removing dovecot from the equation, reduced this to just tika,
> reproducible here
>
> running
>
>         ls -al /srv/tika/tika-server.jar
>                 lrwxrwxrwx 1 root root 50 Jul 26 05:42
> /srv/tika/tika-server.jar ->
> tika-server-standard-2.4.2-20220725.215245-121.jar
>
>         systemctl status tika -ln0
>                 ● tika.service - Apache Tika server
>                      Loaded: loaded (/etc/systemd/system/tika.service;
> enabled; vendor preset: disabled)
>                      Active: active (running) since Tue 2022-07-26
> 05:43:01 EDT; 29min ago
>                    Main PID: 10829 (java)
>                       Tasks: 53 (limit: 8812)
>                      Memory: 215.9M
>                         CPU: 37.667s
>                      CGroup: /system.slice/tika.service
>                              ├─ 10829 /usr/bin/java
> -Dpdfbox.fontcache=/var/tika -XX:ParallelGCThreads=1 -XX:CICompilerCount=2
> -XX:-CICompilerCountPerCPU -jar /srv/tika/tika-server.jar -c
> /etc/tika/tika-server-config-custom.xml --host 127.0.0.1 --port 9998
>                              └─ 10863 /usr/bin/java -Xms1g -Xmx1g
> -Dpdfbox.fontcache=/var/tika -Dlog4j2.debug -Djava.awt.headless=true -cp
> /srv/tika/tika-server.jar -Dtika.server.id=
> org.apache.tika.server.core.TikaServerProcess -h 127.0.0.1 -p 9998 -i "" -c
> /etc/tika/tika-server-config-custom.xml -forkedStatusFile
> /tmp/apache-tika-server-forked-tmp-12945021525641519393 -numRestarts 0
>
> on
>
>         lsb_release -rd
>                 Description:    Fedora release 36 (Thirty Six)
>                 Release:        36
>
> with
>
>         tesseract --version
>                 tesseract 5.0.1
>                  leptonica-1.82.0
>                   libgif 5.2.1 : libjpeg 6b (libjpeg-turbo 2.1.2) : libpng
> 1.6.37 : libtiff 4.4.0 : zlib 1.2.11 : libwebp 1.2.3
>                  Found OpenMP 201511
>
>         stream --version
>                 Version: ImageMagick 7.1.0-44 Q16-HDRI x86_64 20294
> https://imagemagick.org
>                 Copyright: (C) 1999 ImageMagick Studio LLC
>                 License: https://imagemagick.org/script/license.php
>                 Features: Cipher DPC HDRI Modules OpenMP(4.5)
>                 Delegates (built-in): bzlib cairo djvu fontconfig freetype
> gslib gvc heic jbig jng jp2 jpeg lcms lqr ltdl lzma openexr pangocairo png
> ps raqm raw rsvg tiff webp wmf x xml zip zlib
>                 Compiler: gcc (12.1)
>
>         java -version
>                 Picked up JAVA_TOOL_OPTIONS: -Xmx512M
>                 openjdk version "18.0.1.1" 2022-04-22
>                 OpenJDK Runtime Environment 22.3 (build 18.0.1.1+2)
>                 OpenJDK 64-Bit Server VM 22.3 (build 18.0.1.1+2, mixed
> mode, sharing)
>
> & custom config
>
>         cat /etc/tika/tika-server-config-custom.xml
>                 <?xml version="1.0" encoding="UTF-8"?>
>                 <properties>
>                   <parsers>
>                   </parsers>
>                   <server>
>                     <params>
>                       <logLevel>debug</logLevel>
>                       <javaPath>/usr/bin/java</javaPath>
>                       <noFork>false</noFork>
>                       <forkedJvmArgs>
>                         <arg>-Xms1g</arg>
>                         <arg>-Xmx1g</arg>
>                         <arg>-Dpdfbox.fontcache=/var/tika</arg>
>                         <arg>-Dlog4j2.debug</arg>
>                       </forkedJvmArgs>
>                       <digest>sha256</digest>
>
> <enableUnsecureFeatures>false</enableUnsecureFeatures>
>                       <id></id>
>                       <maxFiles>100000</maxFiles>
>
> <maxForkedStartupMillis>120000</maxForkedStartupMillis>
>                       <maxRestarts>-1</maxRestarts>
>                       <minimumTimeoutMillis>30000</minimumTimeoutMillis>
>                       <returnStackTrace>false</returnStackTrace>
>                       <taskPulseMillis>10000</taskPulseMillis>
>                       <taskTimeoutMillis>300000</taskTimeoutMillis>
>                       <endpoints>
>                         <endpoint>tika</endpoint>
>                         <endpoint>status</endpoint>
>                         <endpoint>rmeta</endpoint>
>                       </endpoints>
>                     </params>
>                   </server>
>                 </properties>
>
> on exec, passing a test pdf,
>
>         curl -T ~/Get_Started_With_Smallpdf.pdf http://127.0.0.1:9998/meta
>
> complete metadata's returned
>
>                 <x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP
> Core Test.SNAPSHOT">
>                   <rdf:RDF xmlns:rdf="
> http://www.w3.org/1999/02/22-rdf-syntax-ns#">
>                     <rdf:Description rdf:about=""
>                         xmlns:pdf="http://ns.adobe.com/pdf/1.3/"
>                         xmlns:xmp="http://ns.adobe.com/xap/1.0/"
>                         xmlns:dc="http://purl.org/dc/elements/1.1/"
>                         xmlns:xmpMM="http://ns.adobe.com/xap/1.0/mm/"
>                         xmlns:xmpTPg="http://ns.adobe.com/xap/1.0/t/pg/"
>                       pdf:PDFVersion="1.7"
>                       pdf:hasXFA="false"
>                       pdf:hasCollection="false"
>                       pdf:encrypted="false"
>                       pdf:hasMarkedContent="false"
>                       pdf:producer="Adobe PDF Library 15.0"
>                       pdf:hasXMP="true"
>                       xmp:CreatorTool="Adobe InDesign 15.1 (Macintosh)"
>                       xmp:CreateDate="2020-10-14T17:08:10Z"
>                       xmp:ModifyDate="2020-10-14T17:08:10Z"
>                       xmp:MetadataDate="2020-10-14T17:08:10Z"
>                       dc:format="application/pdf; version=1.7"
>                       dc:language="en-US"
>                       xmpMM:DocumentID="xmp.id:7a865d84-8dbf-4015-96b7-fdae89a9603b"
>                       xmpTPg:NPages="1">
>                       <pdf:unmappedUnicodeCharsPerPage>
>                         <rdf:Seq>
>                           <rdf:li>0</rdf:li>
>                         </rdf:Seq>
>                       </pdf:unmappedUnicodeCharsPerPage>
>                       <pdf:charsPerPage>
>                         <rdf:Seq>
>                           <rdf:li>794</rdf:li>
>                         </rdf:Seq>
>                       </pdf:charsPerPage>
>                       <pdf:annotationTypes>
>                         <rdf:Bag>
>
> <rdf:li>95e8dd6e9b4c5a3d-3d44cd989a3a348c</rdf:li>
>
> <rdf:li>95e8dd6f9b4c5a3e-3d44cd979a3a348b</rdf:li>
>
> <rdf:li>95e8dd709b4c5a3f-3d44cd969a3a348a</rdf:li>
>
> <rdf:li>95e8dd719b4c5a40-3d44cd959a3a3489</rdf:li>
>
> <rdf:li>95e8dd729b4c5a41-3d44cd949a3a3488</rdf:li>
>                         </rdf:Bag>
>                       </pdf:annotationTypes>
>                       <pdf:annotationSubtypes>
>                         <rdf:Bag>
>                           <rdf:li>Link</rdf:li>
>                         </rdf:Bag>
>                       </pdf:annotationSubtypes>
>                     </rdf:Description>
>                   </rdf:RDF>
>                 </x:xmpmeta>
>
> if i add TesseractOCRParser class config to the above, for simple param
> override
>
>         cat /etc/tika/tika-server-config-custom.xml
>                 <?xml version="1.0" encoding="UTF-8"?>
>                 <properties>
>                   <parsers>
> +                       <parser
> class="org.apache.tika.parser.ocr.TesseractOCRParser">
> +                         <params>
> +                           <param name="timeoutSeconds"
> type="int">180</param>
> +                         </params>
> +                       </parser>
>                   </parsers>
>                 ...
>
> exec
>
>         systemctl restart tika
>         curl -T ~/Get_Started_With_Smallpdf.pdf http://127.0.0.1:9998/meta
>
> returns incomplete/truncated data
>
>                 <x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP
> Core Test.SNAPSHOT">
>                   <rdf:RDF xmlns:rdf="
> http://www.w3.org/1999/02/22-rdf-syntax-ns#">
>                     <rdf:Description rdf:about=""/>
>                   </rdf:RDF>
>                 </x:xmpmeta>
>

Reply via email to