Author: siren Date: Fri Feb 27 06:21:37 2009 New Revision: 748408 URL: http://svn.apache.org/viewvc?rev=748408&view=rev Log: NUTCH-699 - Add an "official" solr schema for solr integration. Contributed by dogacan, Dmitry Lihachev
Added: lucene/nutch/trunk/conf/schema.xml Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=748408&r1=748407&r2=748408&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Feb 27 06:21:37 2009 @@ -361,6 +361,9 @@ 135. NUTCH-698 - CrawlDb is corrupted after a few crawl cycles (dogacan via siren) + +136. NUTCH-699 - Add an "official" solr schema for solr integration (dogacan, + Dmitry Lihachev via siren) Release 0.9 - 2007-04-02 Added: lucene/nutch/trunk/conf/schema.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/schema.xml?rev=748408&view=auto ============================================================================== --- lucene/nutch/trunk/conf/schema.xml (added) +++ lucene/nutch/trunk/conf/schema.xml Fri Feb 27 06:21:37 2009 @@ -0,0 +1,109 @@ +<?xml version="1.0" encoding="UTF-8" ?> + <!-- + Licensed to the Apache Software Foundation (ASF) under one or + more contributor license agreements. See the NOTICE file + distributed with this work for additional information regarding + copyright ownership. The ASF licenses this file to You under the + Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain + a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 Unless required by + applicable law or agreed to in writing, software distributed + under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions + and limitations under the License. + --> + <!-- + Description: This document contains the Solr schema definition to be + used with the Solr integration currently built into Nutch. 
See + https://issues.apache.org/jira/browse/NUTCH-442 + https://issues.apache.org/jira/browse/NUTCH-699 for more info. + --> +<schema name="nutch" version="1.1"> + <!-- Field types; each field declaration below selects one of these through its type attribute. --> <types> + <fieldType name="string" class="solr.StrField" + sortMissingLast="true" omitNorms="true"/> + <fieldType name="long" class="solr.LongField" + omitNorms="true"/> + <fieldType name="float" class="solr.FloatField" + omitNorms="true"/> + <!-- "text": whitespace tokenizer followed by stop-word, word-delimiter, lower-case, English Porter stemming and duplicate-removal filters; used by the content and title fields. --> <fieldType name="text" class="solr.TextField" + positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.StopFilterFactory" + ignoreCase="true" words="stopwords.txt"/> + <filter class="solr.WordDelimiterFilterFactory" + generateWordParts="1" generateNumberParts="1" + catenateWords="1" catenateNumbers="1" catenateAll="0" + splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.EnglishPorterFilterFactory" + protected="protwords.txt"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + <!-- "url": standard tokenizer followed by lower-case, word-delimiter and duplicate-removal filters; used by the host and url fields. --> <fieldType name="url" class="solr.TextField" + positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.WordDelimiterFilterFactory" + generateWordParts="1" generateNumberParts="1"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + </types> + <fields> + <!-- id is declared as the uniqueKey and is the copyField destination for url (both declared at the end of this schema). --> <field name="id" type="string" stored="true" indexed="true"/> + + <!-- core fields --> + <field name="segment" type="string" stored="true" indexed="false"/> + <field name="digest" type="string" stored="true" indexed="false"/> + <field name="boost" type="float" stored="true" indexed="false"/> + + <!-- fields for index-basic plugin --> + <field name="host" type="url" stored="false" indexed="true"/> + <field name="site" type="string" stored="false" indexed="true"/> + <field name="url" type="url" stored="true" indexed="true" + required="true"/> + <field 
name="content" type="text" stored="false" indexed="true"/> + <field name="title" type="text" stored="true" indexed="true"/> + <field name="cache" type="string" stored="true" indexed="false"/> + <field name="tstamp" type="long" stored="true" indexed="false"/> + + <!-- fields for index-anchor plugin --> + <field name="anchor" type="string" stored="true" indexed="true" + multiValued="true"/> + + <!-- fields for index-more plugin --> + <field name="type" type="string" stored="true" indexed="true" + multiValued="true"/> + <field name="contentLength" type="long" stored="true" + indexed="false"/> + <field name="lastModified" type="long" stored="true" + indexed="false"/> + <field name="date" type="string" stored="true" indexed="true"/> + + <!-- fields for languageidentifier plugin --> + <field name="lang" type="string" stored="true" indexed="true"/> + + <!-- fields for subcollection plugin --> + <field name="subcollection" type="string" stored="true" + indexed="true"/> + + <!-- fields for feed plugin --> + <field name="author" type="string" stored="true" indexed="true"/> + <field name="tag" type="string" stored="true" indexed="true"/> + <field name="feed" type="string" stored="true" indexed="true"/> + <field name="publishedDate" type="string" stored="true" + indexed="true"/> + <field name="updatedDate" type="string" stored="true" + indexed="true"/> + </fields> + <!-- Schema-wide settings: the unique key field (id), the default search field (content), the default query operator (OR), and the copy of url into id. --> <uniqueKey>id</uniqueKey> + <defaultSearchField>content</defaultSearchField> + <solrQueryParser defaultOperator="OR"/> + <copyField source="url" dest="id"/> +</schema> \ No newline at end of file