I solved it with Tika 3.0 and this tika-config.xml:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Tika configuration: server settings, service-loader behaviour, and the
  parser set with per-parser parameters.
-->
<properties>
  <!--
    Server settings: task timeout, minimum timeout, listen port, file limit
    and the JVM arguments for the forked process.
    NOTE(review): the Tika 2.x/3.x server documentation nests these settings
    inside a <params> child of <server>; verify that the values below are
    actually picked up in this un-nested form.
  -->
  <server>
    <taskTimeoutMillis>120000</taskTimeoutMillis>
    <minimumTimeoutMillis>10</minimumTimeoutMillis>
    <port>9998</port>
    <maxFiles>20000</maxFiles>
    <forkedJvmArgs>
      <arg>-Xmx512m</arg>
    </forkedJvmArgs>
  </server>

  <!--
    Both attributes live on a single element. They were previously split
    across two <service-loader> elements; Tika appears to read only the first
    such element, so loadErrorHandler="WARN" was probably being ignored
    (confirm against TikaConfig).
  -->
  <service-loader dynamic="true" loadErrorHandler="WARN"/>

  <parsers>
    <!-- DefaultParser with the Tesseract OCR parser and OfficeParser excluded. -->
    <parser class="org.apache.tika.parser.DefaultParser">
      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
      <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
      <!-- NOTE(review): confirm DefaultParser recognises these parameter names. -->
      <params>
        <param name="byteArrayMaxOverride" type="int">30000000</param>
        <param name="suppressExceptions" type="bool">true</param>
        <param name="ignoreTikaErrors" type="bool">true</param>
      </params>
    </parser>

    <!-- OOXML parser: shape-based content disabled. -->
    <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
      <params>
        <param name="includeShapeBasedContent" type="bool">false</param>
        <param name="suppressExceptions" type="bool">true</param>
      </params>
    </parser>

    <!--
      PDF parser settings.
      NOTE(review): confirm the "pdfbox.enableAutoSpace" parameter name
      against the PDFParser configuration documentation.
    -->
    <parser class="org.apache.tika.parser.pdf.PDFParser">
      <params>
        <param name="pdfbox.enableAutoSpace" type="bool">true</param>
        <param name="suppressExceptions" type="bool">true</param>
      </params>
    </parser>
  </parsers>
</properties>




Thanks a lot


Mario

Reply via email to