I believe that's because Nutch has a property, metatag.keywords, that it's
trying to send to Solr.
Solr then complains that it doesn't know where to put it, because it's not
defined in fields.

One option that worked for me was defining a dynamic field, which catches
all otherwise-unmatched field names but doesn't store or index them
(effectively ignoring fields that aren't defined).

Pop this inside the fields tag and see if it helps.
<dynamicField name="*" type="string" indexed="false" stored="false"
multiValued="false" />

On Wed, Apr 8, 2015 at 1:27 PM, Anchit Jain <[email protected]>
wrote:

> I have crawled a website using nutch.
> When I try to index it with solr I get following error
> org.apache.solr.common.SolrException: ERROR: [doc=http://xyz.htm] unknown
> field 'metatag.keywords'
> *unknown field 'metatag.keywords'*
>
> I cannot figure out where the error is, as I have not defined any field
> in schema.xml for metatags. I just copied the schema.xml from nutch into
> solr.
> I am using Nutch 1.9 with Solr 4.10
>
> My *schema.xml* for *solr*
>
> <?xml version="1.0" encoding="UTF-8" ?>
> <schema name="nutch" version="1.5">
>     <types>
>         <fieldType name="string" class="solr.StrField"
> sortMissingLast="true"
>             omitNorms="true"/>
>         <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
>             omitNorms="true" positionIncrementGap="0"/>
>         <fieldType name="float" class="solr.TrieFloatField"
> precisionStep="0"
>             omitNorms="true" positionIncrementGap="0"/>
>         <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
>             omitNorms="true" positionIncrementGap="0"/>
>
>         <fieldType name="text" class="solr.TextField"
>             positionIncrementGap="100">
>             <analyzer>
>                 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>                 <filter class="solr.StopFilterFactory"
>                     ignoreCase="true" words="stopwords.txt"/>
>                 <filter class="solr.WordDelimiterFilterFactory"
>                     generateWordParts="1" generateNumberParts="1"
>                     catenateWords="1" catenateNumbers="1" catenateAll="0"
>                     splitOnCaseChange="1"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <!--<filter class="solr.EnglishPorterFilterFactory"
>                     protected="protwords.txt"/>-->
>                 <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
>             </analyzer>
>         </fieldType>
>         <fieldType name="url" class="solr.TextField"
>             positionIncrementGap="100">
>             <analyzer>
>                 <tokenizer class="solr.StandardTokenizerFactory"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.WordDelimiterFilterFactory"
>                     generateWordParts="1" generateNumberParts="1"/>
>             </analyzer>
>         </fieldType>
>     </types>
>     <fields>
>         <field name="id" type="string" stored="true" indexed="true"
>             required="true"/>
>         <field name="_version_" type="long" indexed="true" stored="true"/>
>         <!-- core fields -->
>         <field name="segment" type="string" stored="true" indexed="false"/>
>         <field name="digest" type="string" stored="true" indexed="false"/>
>         <field name="boost" type="float" stored="true" indexed="false"/>
>
>         <!-- fields for index-basic plugin -->
>         <field name="host" type="string" stored="false" indexed="true"/>
>         <field name="url" type="url" stored="true" indexed="true"/>
>         <field name="content" type="text" stored="true" indexed="true"/>
>         <field name="title" type="text" stored="true" indexed="true"/>
>         <field name="cache" type="string" stored="true" indexed="false"/>
>         <field name="tstamp" type="date" stored="true" indexed="false"/>
>
>         <!-- fields for index-anchor plugin -->
>         <field name="anchor" type="string" stored="true" indexed="true"
>             multiValued="true"/>
>
>         <!-- fields for index-more plugin -->
>         <field name="type" type="string" stored="true" indexed="true"
>             multiValued="true"/>
>         <field name="contentLength" type="long" stored="true"
>             indexed="false"/>
>         <field name="lastModified" type="date" stored="true"
>             indexed="false"/>
>         <field name="date" type="date" stored="true" indexed="true"/>
>
>         <!-- fields for languageidentifier plugin -->
>         <field name="lang" type="string" stored="true" indexed="true"/>
>
>         <!-- fields for subcollection plugin -->
>         <field name="subcollection" type="string" stored="true"
>             indexed="true" multiValued="true"/>
>
>         <!-- fields for feed plugin (tag is also used by
> microformats-reltag)-->
>         <field name="author" type="string" stored="true" indexed="true"/>
>         <field name="tag" type="string" stored="true" indexed="true"
> multiValued="true"/>
>         <field name="feed" type="string" stored="true" indexed="true"/>
>         <field name="publishedDate" type="date" stored="true"
>             indexed="true"/>
>         <field name="updatedDate" type="date" stored="true"
>             indexed="true"/>
>
>         <!-- fields for creativecommons plugin -->
>         <field name="cc" type="string" stored="true" indexed="true"
>             multiValued="true"/>
>
>         <!-- fields for tld plugin -->
>         <field name="tld" type="string" stored="false" indexed="false"/>
>     </fields>
>     <uniqueKey>id</uniqueKey>
>     <defaultSearchField>content</defaultSearchField>
>     <solrQueryParser defaultOperator="OR"/>
> </schema>
>
> my *solrindex-mapping.xml*
>
> <mapping>
>     <fields>
>         <field dest="content" source="content"/>
>         <field dest="title" source="title"/>
>         <field dest="host" source="host"/>
>         <field dest="segment" source="segment"/>
>         <field dest="boost" source="boost"/>
>         <field dest="digest" source="digest"/>
>         <field dest="tstamp" source="tstamp"/>
>     </fields>
>     <uniqueKey>id</uniqueKey>
> </mapping>
>
>
>

Reply via email to