Indexing only “readable/parsable” text from pdf

Croci Francesco Luigi (ID SWS) Fri, 14 Mar 2014 04:13:29 -0700

I have to index a list of PDFs. For some of them there is no problem, but 
for others, when I look at the indexed content, I only see a lot of diamonds 
with a question mark in them.


I think the problem is either the font used in the document or that the 
content is "encapsulated" in a picture.

Is there a way to tell Tika to extract only the "readable/parsable" text of a 
PDF?

When I query all the documents (with my Java application), this is an example 
of what I see in the log file for the content of the problematic files:

    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << "  
[0xe8]?[0x1]d41d8cd98f00b204e9800998ecf8427e[0xb][0xa4][0xe5][0x81](Diverses[0xe6]=aabhpdtyan3vfsujquccemebqr4m3[0xe7][0x81]?[0xc1][0x4]
 [\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << " E-Mail zur Archivierung [\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << "    [\n]">
    DEBUG org.apache.http.wire -  << " 
[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << 
"[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << " 
[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << " 
[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << 
"[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << " 
[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << " 
[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << 
"[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">
    DEBUG org.apache.http.wire -  << 
"[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << 
"[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << 
"[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">
    DEBUG org.apache.http.wire -  << 
"[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << " 
[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd] [\n]">
    DEBUG org.apache.http.wire -  << "  [\n]">
    DEBUG org.apache.http.wire -  << " [\n]">
    DEBUG org.apache.http.wire -  << " [0x9] data1.pdf [\n]">



Another problem is that for all the files (including the "good ones") there is 
a long run of `\n` at the beginning of the content field, as you can also see 
above. How can I avoid this?


Here is my schema.xml:

    <?xml version="1.0" encoding="UTF-8" ?>
    <schema name="simple" version="1.1">
                <types>
                               <fieldtype name="string" class="solr.StrField" 
postingsFormat="SimpleText" />
                               <fieldtype name="ignored" class="solr.TextField" 
/>
                                <fieldtype name="text" class="solr.TextField" 
postingsFormat="SimpleText">
                                               <analyzer>
                                                               <charFilter 
class="solr.PatternReplaceCharFilterFactory" pattern="\n" replacement=""/>
                                                               <tokenizer 
class="solr.StandardTokenizerFactory"/>
                                                               <filter 
class="solr.LowerCaseFilterFactory" /> <!--Lowercases the letters in each 
token. Leaves non-letter tokens alone.-->
                                                               <filter 
class="solr.ClassicFilterFactory" /> <!--Removes dots from acronyms and 's from 
the end of tokens. Works only on typed tokens produced by ClassicTokenizer or 
equivalent.-->
                                                               <filter 
class="solr.TrimFilterFactory"/> <!--Trims whitespace at either end of a token. 
-->
                                                               <filter 
class="solr.StopFilterFactory" ignoreCase="true"/> <!--Discards common words.  
-->
                                                               <filter 
class="solr.RemoveDuplicatesTokenFilterFactory"/>
                                               </analyzer>
                               </fieldtype>
                </types>

                <fields>
                               <field name="signatureField" type="string" 
indexed="true" stored="true" multiValued="false" />
                               <dynamicField name="ignored_*" type="ignored" 
multiValued="true" indexed="false" stored="false" />
                               <field name="id" type="string" indexed="true" 
stored="true" multiValued="false" />
                               <field name="rmDocumentTitle" type="string" 
indexed="true" stored="true" multiValued="true"/>
                               <field name="fullText" indexed="true" 
type="text" multiValued="true" />
                </fields>

                <defaultSearchField>fullText</defaultSearchField>

                <solrQueryParser defaultOperator="OR" />
                <uniqueKey>id</uniqueKey>
    </schema>

and my solrconfig.xml:

    <?xml version="1.0" encoding="UTF-8" ?>
    <config>
                <luceneMatchVersion>LUCENE_45</luceneMatchVersion>
                <directoryFactory name='DirectoryFactory' 
class='solr.MMapDirectoryFactory' />

                <codecFactory name="CodecFactory" 
class="solr.SchemaCodecFactory" />

                <lib dir='${solr.core.instanceDir}\lib' />
                <lib dir="${solr.core.instanceDir}\dist\" 
regex="solr-cell-\d.*\.jar" />
                <lib dir="${solr.core.instanceDir}\contrib\extraction\lib" 
regex=".*\.jar" />

                <requestHandler name="standard" 
class="solr.StandardRequestHandler" default="true" />

                <requestHandler name="/update" 
class="solr.UpdateRequestHandler">
                               <lst name="defaults">
                                               <str 
name="update.chain">deduplication</str>
                               </lst>
                </requestHandler>

                <requestHandler name="/update/extract" 
class="solr.extraction.ExtractingRequestHandler">
                               <lst name="defaults">
                                               <str 
name="captureAttr">true</str>
                                               <str 
name="lowernames">false</str>
                                               <str name="overwrite">false</str>
                                               <str 
name="captureAttr">true</str>
                                               <str 
name="literalsOverride">true</str>
                                               <str 
name="uprefix">ignored_</str>
                                               <str name="fmap.a">link</str>
                                               <str 
name="fmap.content">fullText</str>
                                               <!-- the configuration here 
could be useful for tests -->
                                               <str 
name="update.chain">deduplication</str>
                               </lst>
                </requestHandler>

                <updateRequestProcessorChain name="deduplication">
                               <processor
                                               
class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
                                               <bool 
name="overwriteDupes">false</bool>
                                               <str 
name="signatureField">signatureField</str>
                                               <bool name="enabled">true</bool>
                                               <str name="fields">content</str>
                                               <str name="minTokenLen">10</str>
                                               <str name="quantRate">.2</str>
                                               <str 
name="signatureClass">solr.update.processor.TextProfileSignature</str>
                               </processor>
                               <processor 
class="solr.LogUpdateProcessorFactory" />
                               <processor 
class="solr.RunUpdateProcessorFactory" />
                </updateRequestProcessorChain>

                <requestHandler name="/admin/"
                               
class="org.apache.solr.handler.admin.AdminHandlers" />

                <lockType>none</lockType>

                <admin>
                               <defaultQuery>*:*</defaultQuery>
                </admin>

    </config>

Indexing only “readable/parsable” text from pdf

Reply via email to