I was looking for a way to index only the content/text part of a PDF (without
all the other fields Tika creates), and I found the "solution" of setting
"uprefix" = ignored_ together with <dynamicField name="ignored_*" type="ignored"
multiValued="true" indexed="false" stored="false" />.
The problem is that uprefix only applies to fields that are not defined in the
schema. In my schema I defined two fields (id and rmDocumentTitle), and these
two fields are added to the content too (which I want to avoid).
How can I prevent these two fields from being added to fullText?
Here are my config files:
schema.xml
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="simple" version="1.1">

  <types>
    <!-- Plain string type, used for id, rmDocumentTitle and signatureField. -->
    <fieldtype name="string"
               class="solr.StrField"
               postingsFormat="SimpleText"/>

    <!-- Type behind the ignored_* dynamic field declared under <fields>. -->
    <fieldtype name="ignored" class="solr.TextField"/>

    <!-- Analyzed text type, used for fullText. -->
    <fieldtype name="text"
               class="solr.TextField"
               postingsFormat="SimpleText">

      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>

        <!-- Disabled: converts alphabetic, numeric, and symbolic Unicode
             characters which are not in the first 127 ASCII characters into
             their ASCII equivalents, if one exists. -->
        <!-- <filter class="solr.ASCIIFoldingFilterFactory"/> -->

        <!-- Lowercases the letters in each token. Leaves non-letter tokens
             alone. -->
        <filter class="solr.LowerCaseFilterFactory"/>

        <!-- Trims whitespace at either end of a token. -->
        <filter class="solr.TrimFilterFactory"/>

        <!-- Discards common words. -->
        <filter class="solr.StopFilterFactory"
                words="stopwords.txt"
                ignoreCase="true"/>

        <filter class="solr.PorterStemFilterFactory"/>

        <!-- Disabled alternative stemmer. -->
        <!-- <filter class="solr.SnowballPorterFilterFactory" language="German2"/> -->

        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>

      <!-- Same active filters as the index analyzer, except that the stop
           filter runs before lower-casing here (it is configured with
           ignoreCase="true"). -->
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"
                words="stopwords.txt"
                ignoreCase="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.TrimFilterFactory"/>
        <filter class="solr.PorterStemFilterFactory"/>

        <!-- Disabled alternative stemmer. -->
        <!-- <filter class="solr.SnowballPorterFilterFactory" language="German2"/> -->

        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldtype>
  </types>

  <fields>
    <!-- Field named as signatureField by the deduplication update chain. -->
    <field name="signatureField"
           type="string"
           indexed="true"
           stored="true"
           multiValued="false"/>

    <!-- Catch-all for ignored_* fields: neither indexed nor stored. -->
    <dynamicField name="ignored_*"
                  type="ignored"
                  indexed="false"
                  stored="false"
                  multiValued="true"/>

    <!-- Document key (see <uniqueKey> below). -->
    <field name="id"
           type="string"
           indexed="true"
           stored="true"
           multiValued="false"/>

    <field name="rmDocumentTitle"
           type="string"
           indexed="true"
           stored="true"
           multiValued="true"/>

    <!-- Analyzed document text; also the default search field. -->
    <field name="fullText"
           type="text"
           indexed="true"
           multiValued="true"/>
  </fields>

  <defaultSearchField>fullText</defaultSearchField>
  <solrQueryParser defaultOperator="OR"/>
  <uniqueKey>id</uniqueKey>

</schema>
solrconfig.xml
<?xml version="1.0" encoding="UTF-8" ?>
<config>
...
<!-- Extraction endpoint backed by ExtractingRequestHandler. The <lst> below
     holds the default request parameters for this handler. -->
<requestHandler name="/update/extract"
                class="solr.extraction.ExtractingRequestHandler">
  <lst name="defaults">
    <!-- Declared once; a second identical captureAttr entry was redundant. -->
    <str name="captureAttr">true</str>
    <str name="lowernames">false</str>
    <str name="overwrite">false</str>
    <str name="literalsOverride">true</str>

    <!-- Prefix for extracted fields that are not defined in the schema; the
         schema's ignored_* dynamic field is neither indexed nor stored. -->
    <str name="uprefix">ignored_</str>

    <!-- NOTE(review): no "link" field is declared in the schema shown with
         this config, so values mapped here presumably end up as
         ignored_link - confirm this is intended. -->
    <str name="fmap.a">link</str>

    <!-- Renames the extracted "content" field to the schema's fullText. -->
    <str name="fmap.content">fullText</str>

    <!-- the configuration here could be useful for tests -->
    <str name="update.chain">deduplication</str>
  </lst>
</requestHandler>
<!-- Update chain that writes a TextProfileSignature of the document text into
     signatureField, then logs and runs the update. -->
<updateRequestProcessorChain name="deduplication">
  <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
    <bool name="enabled">true</bool>
    <!-- false: documents with an equal signature are not overwritten. -->
    <bool name="overwriteDupes">false</bool>
    <str name="signatureField">signatureField</str>
    <!-- The /update/extract handler in this file renames Tika's "content" to
         "fullText" (fmap.content) before the document reaches this chain, so
         the signature has to be computed over fullText. With "content" the
         configured field is absent from the document and the signature is
         not derived from the document text. -->
    <str name="fields">fullText</str>
    <str name="minTokenLen">10</str>
    <str name="quantRate">.2</str>
    <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
  </processor>
  <processor class="solr.LogUpdateProcessorFactory"/>
  <processor class="solr.RunUpdateProcessorFactory"/>
</updateRequestProcessorChain>
<!-- Registers the admin handlers under /admin/. -->
<requestHandler name="/admin/"
                class="org.apache.solr.handler.admin.AdminHandlers"/>

<!-- NOTE(review): lockType sits directly under <config> here; confirm that
     the Solr version in use reads it from this location. -->
<lockType>none</lockType>

<!-- Default query for the admin section: match all documents. -->
<admin>
  <defaultQuery>*:*</defaultQuery>
</admin>
</config>
Thank you for any help.
Francesco