Author: lewismc
Date: Tue Aug 18 21:19:07 2015
New Revision: 1696506
URL: http://svn.apache.org/r1696506
Log:
NUTCH-1486 Upgrade to Solr 4.10.2
Added:
nutch/trunk/src/plugin/index-geoip/build-ivy.xml
- copied, changed from r1693938,
nutch/trunk/src/plugin/parse-tika/build-ivy.xml
nutch/trunk/src/plugin/indexer-solr/build-ivy.xml
nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml
- copied, changed from r1693468,
nutch/trunk/src/plugin/parse-tika/build-ivy.xml
Removed:
nutch/trunk/conf/schema-solr4.xml
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/log4j.properties
nutch/trunk/conf/nutch-default.xml
nutch/trunk/conf/schema.xml
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
nutch/trunk/src/plugin/index-geoip/ivy.xml
nutch/trunk/src/plugin/index-geoip/plugin.xml
nutch/trunk/src/plugin/indexer-solr/ivy.xml
nutch/trunk/src/plugin/indexer-solr/plugin.xml
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
nutch/trunk/src/plugin/parse-tika/ivy.xml
nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml
nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Aug 18 21:19:07 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1486 Upgrade to Solr 4.10.2 (lewismc, markus)
+
* NUTCH-2048 parse-tika: fix dependencies in plugin.xml (Michael Joyce via
snagel)
* NUTCH-2066 Parameterize Generate REST endpoint (Sujen Shah via mattmann)
Modified: nutch/trunk/conf/log4j.properties
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Tue Aug 18 21:19:07 2015
@@ -44,10 +44,8 @@ log4j.logger.org.apache.nutch.crawl.Craw
log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.IndexingJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrIndexer=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrDeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrClean=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexwriter.solr.SolrIndexWriter=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexwriter.solr.SolrUtils=INFO,cmdstdout
log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout
log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout
log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Aug 18 21:19:07 2015
@@ -1615,6 +1615,18 @@ CAUTION: Set the parser.timeout to -1 or
</property>
<!-- solr index properties -->
+
+<property>
+ <name>solr.server.type</name>
+ <value>http</value>
+ <description>
+ Specifies the SolrServer implementation to use. This is a string value
+ of one of the following 'cloud', 'concurrent', 'http' or 'lb'.
+ The values represent CloudSolrServer, ConcurrentUpdateSolrServer,
+ HttpSolrServer or LBHttpSolrServer respectively.
+ </description>
+</property>
+
<property>
<name>solr.server.url</name>
<value>http://127.0.0.1:8983/solr/</value>
@@ -1624,6 +1636,25 @@ CAUTION: Set the parser.timeout to -1 or
</description>
</property>
+<property>
+ <name>solr.zookeeper.url</name>
+ <value></value>
+ <description>
+ Defines the Zookeeper URL which is an essential setting to be used
+ when using SolrCloud. This should be a fully qualified URL similar to
+ the property provided within 'solr.server.url' above.
+ </description>
+</property>
+
+<property>
+ <name>solr.loadbalance.urls</name>
+ <value></value>
+ <description>
+ A comma-separated value representing the Solr servers to be used when
+ initiating LBHttpSolrServer as the SolrServer implementation.
+ </description>
+</property>
+
<property>
<name>solr.mapping.file</name>
<value>solrindex-mapping.xml</value>
Modified: nutch/trunk/conf/schema.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Tue Aug 18 21:19:07 2015
@@ -1,72 +1,308 @@
<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ Description: This document contains Solr 4.x schema definition to
+ be used with Solr integration currently built into Nutch.
+ This schema is not minimal, there are some useful field type definitions
left,
+ and the set of fields and their flags (indexed/stored/term vectors) can be
+ further optimized depending on needs. See
+
http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+ for more info.
+-->
+
+<schema name="nutch" version="1.5">
+
+ <types>
+
+ <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true"
omitNorms="true"/>
+
+ <fieldtype name="binary" class="solr.BinaryField"/>
+
+
<!--
- Licensed to the Apache Software Foundation (ASF) under one or
- more contributor license agreements. See the NOTICE file
- distributed with this work for additional information regarding
- copyright ownership. The ASF licenses this file to You under the
- Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain
- a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0 Unless required by
- applicable law or agreed to in writing, software distributed
- under the License is distributed on an "AS IS" BASIS, WITHOUT
- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions
- and limitations under the License.
+ Default numeric field types. For faster range queries, consider the
tint/tfloat/tlong/tdouble types.
-->
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>
+
<!--
- Description: This document contains Solr 3.1 schema definition to
- be used with Solr integration currently build into Nutch. See
- https://issues.apache.org/jira/browse/NUTCH-442
- https://issues.apache.org/jira/browse/NUTCH-699
- https://issues.apache.org/jira/browse/NUTCH-994
- https://issues.apache.org/jira/browse/NUTCH-997
- https://issues.apache.org/jira/browse/NUTCH-1058
- https://issues.apache.org/jira/browse/NUTCH-1232
- and
- http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/
- example/solr/conf/schema.xml?view=markup
- for more info.
+ Numeric field types that index each value at various levels of precision
+ to accelerate range queries when the number of values between the range
+ endpoints is large. See the javadoc for NumericRangeQuery for internal
+ implementation details.
+
+ Smaller precisionStep values (specified in bits) will lead to more tokens
+ indexed per value, slightly larger index size, and faster range queries.
+ A precisionStep of 0 disables indexing at different precision levels.
-->
-<schema name="nutch" version="1.5">
- <types>
- <fieldType name="string" class="solr.StrField" sortMissingLast="true"
- omitNorms="true"/>
- <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
- omitNorms="true" positionIncrementGap="0"/>
- <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
- omitNorms="true" positionIncrementGap="0"/>
- <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
- omitNorms="true" positionIncrementGap="0"/>
- <fieldType name="location" class="solr.LatLonType"
subFieldSuffix="_coordinate"/>
- <fieldtype name="binary" class="solr.BinaryField"/>
-
- <fieldType name="text" class="solr.TextField"
- positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.StopFilterFactory"
- ignoreCase="true" words="stopwords.txt"/>
- <filter class="solr.WordDelimiterFilterFactory"
- generateWordParts="1" generateNumberParts="1"
- catenateWords="1" catenateNumbers="1" catenateAll="0"
- splitOnCaseChange="1"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.EnglishPorterFilterFactory"
- protected="protwords.txt"/>
- <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="url" class="solr.TextField"
- positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.WordDelimiterFilterFactory"
- generateWordParts="1" generateNumberParts="1"/>
- </analyzer>
- </fieldType>
- <!-- boolean type: "true" or "false" -->
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="8"
omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8"
omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8"
omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8"
omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- The format for this date field is of the form 1995-12-31T23:59:59Z,
and
+ is a more restricted form of the canonical representation of dateTime
+ http://www.w3.org/TR/xmlschema-2/#dateTime
+ The trailing "Z" designates UTC time and is mandatory.
+ Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+ All other components are mandatory.
+
+ Expressions can also be used to denote calculations that should be
+ performed relative to "NOW" to determine the value, ie...
+
+ NOW/HOUR
+ ... Round to the start of the current hour
+ NOW-1DAY
+ ... Exactly 1 day prior to now
+ NOW/DAY+6MONTHS+3DAYS
+ ... 6 months and 3 days in the future from the start of
+ the current day
+
+ Consult the DateField javadocs for more information.
+
+ Note: For faster range queries, consider the tdate type
+ -->
+ <fieldType name="date" class="solr.TrieDateField" omitNorms="true"
precisionStep="0" positionIncrementGap="0"/>
+
+ <fieldType name="location" class="solr.LatLonType"
subFieldSuffix="_coordinate"/>
+
+ <!-- A Trie based date field for faster date range queries and date
faceting. -->
+ <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true"
precisionStep="6" positionIncrementGap="0"/>
+
+
+ <!-- solr.TextField allows the specification of custom text analyzers
+ specified as a tokenizer and a list of token filters. Different
+ analyzers may be specified for indexing and querying.
+
+ The optional positionIncrementGap puts space between multiple fields
of
+ this type on the same document, with the purpose of preventing false
phrase
+ matching across fields.
+
+ For more info on customizing your analyzer chain, please see
+ http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+ -->
+
+ <!-- A general text field that has reasonable, generic
+ cross-language defaults: it tokenizes with StandardTokenizer,
+ removes stop words from case-insensitive "stopwords.txt"
+ (empty by default), and down cases. At query time only, it
+ also applies synonyms. -->
+ <fieldType name="text_general" class="solr.TextField"
positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory"
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English: it
+ tokenizes with StandardTokenizer, removes English stop words
+ (stopwords.txt), down cases, protects words from protwords.txt, and
+ finally applies Porter's stemming. The query time analyzer
+ also applies synonyms from synonyms.txt. -->
+ <fieldType name="text_en" class="solr.TextField"
positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory"
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory"
protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer
instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory"
protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer
instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English, plus
+ aggressive word-splitting and autophrase features enabled.
+ This field is just like text_en, except it adds
+ WordDelimiterFilter to enable splitting and matching of
+ words on case-change, alpha numeric boundaries, and
+ non-alphanumeric chars. This means certain compound word
+ cases will work, for example query "wi fi" will match
+ document "WiFi" or "wi-fi". However, other cases will still
+ not match, for example if the query is "wifi" and the
+ document is "wi fi" or if the query is "wi-fi" and the
+ document is "wifi".
+ -->
+ <fieldType name="text_en_splitting" class="solr.TextField"
positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory"
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"
splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory"
protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory"
protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Less flexible matching, but less false matches. Probably not ideal
for product names,
+ but may be good for SKUs. Can insert dashes in the wrong place and
still match. -->
+ <fieldType name="text_en_splitting_tight" class="solr.TextField"
positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0"
generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory"
protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the
same position - sometimes
+ possible with WordDelimiterFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Just like text_general except it reverses the characters of
+ each token, to enable more efficient leading wildcard queries. -->
+ <fieldType name="text_general_rev" class="solr.TextField"
positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+ maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldtype name="phonetic" stored="false" indexed="true"
class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="payloads" stored="false" indexed="true"
class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!--
+ The DelimitedPayloadTokenFilter can put payloads on tokens... for
example,
+ a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
+ Attributes of the DelimitedPayloadTokenFilterFactory :
+ "delimiter" - a one character delimiter. Default is | (pipe)
+ "encoder" - how to encode the following value into a payload
+ float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+ integer -> o.a.l.a.p.IntegerEncoder
+ identity -> o.a.l.a.p.IdentityEncoder
+ Fully Qualified class name implementing PayloadEncoder, Encoder
must have a no arg constructor.
+ -->
+ <filter class="solr.DelimitedPayloadTokenFilterFactory"
encoder="float"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- lowercases the entire field value, keeping it as a single token. -->
+ <fieldType name="lowercase" class="solr.TextField"
positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory" />
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1"/>
+ </analyzer>
+ </fieldType>
+
+
+ <fieldType name="text_path" class="solr.TextField"
positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- since fields of this type are by default not stored or indexed,
+ any data added to them will be ignored outright. -->
+ <fieldtype name="ignored" stored="false" indexed="false"
multiValued="true" class="solr.StrField" />
+
+ <!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField"
sortMissingLast="true"/>
<!-- sortMissingLast and sortMissingFirst attributes are optional
attributes are
@@ -83,120 +319,112 @@
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
then default lucene sorting will be used which places docs without the
field first in an ascending sort and last in a descending sort.
- -->
- </types>
- <fields>
- <field name="id" type="string" stored="true" indexed="true"
- required="true"/>
-
- <field name="text" type="text" stored="false" indexed="true"
multiValued="true"/>
-
- <!-- core fields -->
- <field name="segment" type="string" stored="true" indexed="false"/>
- <field name="digest" type="string" stored="true" indexed="false"/>
- <field name="boost" type="float" stored="true" indexed="false"/>
-
- <!-- fields for index-basic plugin -->
- <field name="host" type="string" stored="false" indexed="true"/>
- <field name="url" type="url" stored="true" indexed="true"/>
- <field name="content" type="text" stored="false" indexed="true"/>
- <field name="title" type="text" stored="true" indexed="true"/>
- <field name="cache" type="string" stored="true" indexed="false"/>
- <field name="tstamp" type="date" stored="true" indexed="false"/>
-
- <!-- fields for index-geoip plugin -->
- <field name="ip" type="string" stored="true" indexed="true" />
- <field name="cityName" type="string" stored="true" indexed="true" />
- <field name="cityConfidence" type="int" stored="true" indexed="true" />
- <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
- <field name="continentCode" type="string" stored="true" indexed="true"
/>
- <field name="continentGeoNameId" type="int" stored="true"
indexed="true" />
- <field name="contentName" type="string" stored="true" indexed="true" />
- <field name="countryIsoCode" type="string" stored="true"
indexed="true"/>
- <field name="countryName" type="string" stored="true" indexed="true" />
- <field name="countryConfidence" type="int" stored="true"
indexed="true"/>
- <field name="countryGeoNameId" type="int" stored="true"
indexed="true"/>
- <field name="latLon" type="string" stored="true" indexed="true"/>
- <field name="accRadius" type="int" stored="true" indexed="true"/>
- <field name="timeZone" type="string" stored="true" indexed="true"/>
- <field name="metroCode" type="int" stored="true" indexed="true" />
- <field name="postalCode" type="string" stored="true" indexed="true" />
- <field name="postalConfidence" type="int" stored="true" indexed="true"
/>
- <field name="countryType" type="string" stored="true" indexed="true" />
- <field name="subDivName" type="string" stored="true" indexed="true" />
- <field name="subDivIsoCode" type="string" stored="true" indexed="true"
/>
- <field name="subDivConfidence" type="int" stored="true" indexed="true"
/>
- <field name="subDivGeoNameId" type="int" stored="true" indexed="true"
/>
- <field name="autonSystemNum" type="int" stored="true" indexed="true" />
- <field name="autonSystemOrg" type="string" stored="true"
indexed="true" />
- <field name="domain" type="string" stored="true" indexed="true" />
- <field name="isp" type="string" stored="true" indexed="true" />
- <field name="org" type="string" stored="true" indexed="true" />
- <field name="userType" type="string" stored="true" indexed="true" />
- <field name="isAnonProxy" type="boolean" stored="true" indexed="true"
/>
- <field name="isSatelitteProv" type="boolean" stored="true"
indexed="true" />
- <field name="connType" type="string" stored="true" indexed="true" />
-
-
-
- <dynamicField name="*_coordinate" type="tdouble" indexed="true"
stored="false"/>
-
- <!-- fields for index-anchor plugin -->
- <field name="anchor" type="string" stored="true" indexed="true"
- multiValued="true"/>
-
- <!-- fields for index-more plugin -->
- <field name="type" type="string" stored="true" indexed="true"
- multiValued="true"/>
- <field name="contentLength" type="long" stored="true"
- indexed="false"/>
- <field name="lastModified" type="date" stored="true"
- indexed="false"/>
- <field name="date" type="date" stored="true" indexed="true"/>
-
- <!-- fields for languageidentifier plugin -->
- <field name="lang" type="string" stored="true" indexed="true"/>
-
- <!-- fields for subcollection plugin -->
- <field name="subcollection" type="string" stored="true"
- indexed="true" multiValued="true"/>
-
- <!-- fields for feed plugin (tag is also used by
microformats-reltag)-->
- <field name="author" type="string" stored="true" indexed="true"/>
- <field name="tag" type="string" stored="true" indexed="true"
multiValued="true"/>
- <field name="feed" type="string" stored="true" indexed="true"/>
- <field name="publishedDate" type="date" stored="true"
- indexed="true"/>
- <field name="updatedDate" type="date" stored="true"
- indexed="true"/>
-
- <!-- fields for creativecommons plugin -->
- <field name="cc" type="string" stored="true" indexed="true"
- multiValued="true"/>
-
- <!-- fields for tld plugin -->
- <field name="tld" type="string" stored="false" indexed="false"/>
-
- <!-- field containing segment's raw binary content if indexed with
-addBinaryContent -->
-+ <field name="binaryContent" type="binary" stored="true"
indexed="false"/>
-
- <!-- to work with Solr 4.9 and beyond that use RealTimeGetHandler -->
- <field name="_version_" type="long" indexed="true" stored="true"/>
-
- </fields>
- <uniqueKey>id</uniqueKey>
- <defaultSearchField>content</defaultSearchField>
- <solrQueryParser defaultOperator="OR"/>
-
- <!-- copyField commands copy one field to another at the time a document
- is added to the index. It's used either to index the same field
differently,
- or to add multiple fields to the same field for easier/faster
searching.
- -->
- <copyField source="content" dest="text"/>
- <copyField source="url" dest="text"/>
- <copyField source="title" dest="text"/>
- <copyField source="anchor" dest="text"/>
- <copyField source="author" dest="text"/>
- <copyField source="latLon" dest="location"/>
+ -->
+
+ </types>
+ <fields>
+ <field name="id" type="string" stored="true" indexed="true"
required="true"/>
+ <field name="_version_" type="long" indexed="true" stored="true"/>
+
+ <!-- core fields -->
+ <field name="segment" type="string" stored="true" indexed="false"/>
+ <field name="digest" type="string" stored="true" indexed="false"/>
+ <field name="boost" type="float" stored="true" indexed="false"/>
+
+ <!-- fields for index-basic plugin -->
+ <field name="host" type="url" stored="false" indexed="true"/>
+ <field name="url" type="url" stored="true" indexed="true"/>
+ <!-- stored=true for highlighting, use term vectors and positions for
fast highlighting -->
+ <field name="content" type="text_general" stored="true" indexed="true"/>
+ <field name="title" type="text_general" stored="true" indexed="true"/>
+ <field name="cache" type="string" stored="true" indexed="false"/>
+ <field name="tstamp" type="date" stored="true" indexed="false"/>
+
+ <!-- fields for index-geoip plugin -->
+ <field name="ip" type="string" stored="true" indexed="true" />
+ <field name="cityName" type="string" stored="true" indexed="true" />
+ <field name="cityConfidence" type="int" stored="true" indexed="true" />
+ <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
+ <field name="continentCode" type="string" stored="true" indexed="true" />
+ <field name="continentGeoNameId" type="int" stored="true" indexed="true" />
+ <field name="contentName" type="string" stored="true" indexed="true" />
+ <field name="countryIsoCode" type="string" stored="true" indexed="true"/>
+ <field name="countryName" type="string" stored="true" indexed="true" />
+ <field name="countryConfidence" type="int" stored="true" indexed="true"/>
+ <field name="countryGeoNameId" type="int" stored="true" indexed="true"/>
+ <field name="latLon" type="string" stored="true" indexed="true"/>
+ <field name="accRadius" type="int" stored="true" indexed="true"/>
+ <field name="timeZone" type="string" stored="true" indexed="true"/>
+ <field name="metroCode" type="int" stored="true" indexed="true" />
+ <field name="postalCode" type="string" stored="true" indexed="true" />
+ <field name="postalConfidence" type="int" stored="true" indexed="true" />
+ <field name="countryType" type="string" stored="true" indexed="true" />
+ <field name="subDivName" type="string" stored="true" indexed="true" />
+ <field name="subDivIsoCode" type="string" stored="true" indexed="true" />
+ <field name="subDivConfidence" type="int" stored="true" indexed="true" />
+ <field name="subDivGeoNameId" type="int" stored="true" indexed="true" />
+ <field name="autonSystemNum" type="int" stored="true" indexed="true" />
+ <field name="autonSystemOrg" type="string" stored="true" indexed="true" />
+ <field name="domain" type="string" stored="true" indexed="true" />
+ <field name="isp" type="string" stored="true" indexed="true" />
+ <field name="org" type="string" stored="true" indexed="true" />
+ <field name="userType" type="string" stored="true" indexed="true" />
+ <field name="isAnonProxy" type="boolean" stored="true" indexed="true" />
+ <field name="isSatelitteProv" type="boolean" stored="true" indexed="true"
/>
+ <field name="connType" type="string" stored="true" indexed="true" />
+ <field name="location" type="location" stored="true" indexed="true" />
+
+ <dynamicField name="*_coordinate" type="tdouble" indexed="true"
stored="false"/>
+
+ <!-- catch-all field -->
+ <field name="text" type="text_general" stored="false" indexed="true"
multiValued="true"/>
+
+ <!-- fields for index-anchor plugin -->
+ <field name="anchor" type="text_general" stored="true" indexed="true"
+ multiValued="true"/>
+
+ <!-- fields for index-more plugin -->
+ <field name="type" type="string" stored="true" indexed="true"
multiValued="true"/>
+ <field name="contentLength" type="string" stored="true" indexed="false"/>
+ <field name="lastModified" type="date" stored="true" indexed="false"/>
+ <field name="date" type="tdate" stored="true" indexed="true"/>
+
+ <!-- fields for languageidentifier plugin -->
+ <field name="lang" type="string" stored="true" indexed="true"/>
+
+ <!-- fields for subcollection plugin -->
+ <field name="subcollection" type="string" stored="true" indexed="true"
multiValued="true"/>
+
+ <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
+ <field name="author" type="string" stored="true" indexed="true"/>
+ <field name="tag" type="string" stored="true" indexed="true"
multiValued="true"/>
+ <field name="feed" type="string" stored="true" indexed="true"/>
+ <field name="publishedDate" type="date" stored="true" indexed="true"/>
+ <field name="updatedDate" type="date" stored="true" indexed="true"/>
+
+ <!-- fields for creativecommons plugin -->
+ <field name="cc" type="string" stored="true" indexed="true"
multiValued="true"/>
+
+ <!-- fields for tld plugin -->
+ <field name="tld" type="string" stored="false" indexed="false"/>
+
+ <!-- field containing segment's raw binary content if indexed with
-addBinaryContent -->
+ <field name="binaryContent" type="binary" stored="true" indexed="false"/>
+
+ </fields>
+ <uniqueKey>id</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="OR"/>
+
+ <!-- copyField commands copy one field to another at the time a document
+ is added to the index. It's used either to index the same field
differently,
+ or to add multiple fields to the same field for easier/faster
searching. -->
+
+ <copyField source="content" dest="text"/>
+ <copyField source="url" dest="text"/>
+ <copyField source="title" dest="text"/>
+ <copyField source="anchor" dest="text"/>
+ <copyField source="author" dest="text"/>
+ <copyField source="latLon" dest="location"/>
</schema>
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Tue Aug 18 21:19:07 2015
@@ -34,25 +34,19 @@
</publications>
<dependencies>
- <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
- conf="*->master" />
- <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
- conf="*->master" />
+ <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
conf="*->master" />
+ <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
conf="*->master" />
<dependency org="log4j" name="log4j" rev="1.2.15"
conf="*->master" />
- <dependency org="commons-lang" name="commons-lang" rev="2.6"
- conf="*->default" />
- <dependency org="commons-collections" name="commons-collections"
- rev="3.1" conf="*->default" />
- <dependency org="commons-httpclient" name="commons-httpclient"
- rev="3.1" conf="*->master" />
- <dependency org="commons-codec" name="commons-codec" rev="1.3"
- conf="*->default" />
- <dependency org="org.apache.commons" name="commons-compress"
rev="1.9"
- conf="*->default" />
- <dependency org="org.apache.hadoop" name="hadoop-core"
rev="1.2.0"
- conf="*->default">
+ <dependency org="commons-lang" name="commons-lang" rev="2.6"
conf="*->default" />
+ <dependency org="commons-collections"
name="commons-collections" rev="3.1" conf="*->default" />
+ <dependency org="commons-httpclient" name="commons-httpclient"
rev="3.1" conf="*->master" />
+ <dependency org="commons-codec" name="commons-codec" rev="1.3"
conf="*->default" />
+ <dependency org="org.apache.commons" name="commons-compress" rev="1.9"
conf="*->default" />
+ <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
+
+ <dependency org="org.apache.hadoop" name="hadoop-core"
rev="1.2.0" conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
<exclude org="net.sf.kosmosfs" name="kfs" />
<exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -69,8 +63,9 @@
<dependency org="oro" name="oro" rev="2.0.8" />
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
- <dependency org="com.google.code.crawler-commons"
name="crawler-commons"
- rev="0.5" />
+
+ <dependency org="com.github.crawler-commons"
name="crawler-commons" rev="0.6" />
+
<dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/>
<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws"
rev="3.0.4"/>
<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs"
rev="3.0.4"/>
@@ -79,40 +74,20 @@
<dependency org="com.fasterxml.jackson.core" name="jackson-databind"
rev="2.5.1" />
<dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" />
<dependency org="com.fasterxml.jackson.jaxrs"
name="jackson-jaxrs-json-provider" rev="2.5.1" />
-
- <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
-
- <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
- <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" />
- <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" />
- <dependency org="org.apache.lucene" name="lucene-analyzers-common"
rev="4.3.0" />
- <!--Configuration: test -->
-
<!--artifacts needed for testing -->
<dependency org="junit" name="junit" rev="4.11"
conf="test->default" />
-
- <dependency org="org.apache.hadoop" name="hadoop-test"
rev="1.2.0"
- conf="test->default" />
-
- <dependency org="org.mortbay.jetty" name="jetty-client"
- rev="6.1.22" conf="test->default" />
-
- <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22"
- conf="test->default" />
- <dependency org="org.mortbay.jetty" name="jetty-util"
rev="6.1.22"
- conf="test->default" />
-
- <!-- naive bayes parse filter -->
- <dependency org="org.apache.mahout.commons" name="commons-cli"
rev="2.0-mahout"
- conf="test->default" />
+ <dependency org="org.apache.hadoop" name="hadoop-test"
rev="1.2.0" conf="test->default" />
+ <dependency org="org.mortbay.jetty" name="jetty-client"
rev="6.1.22" conf="test->default" />
+ <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22"
conf="test->default" />
+ <dependency org="org.mortbay.jetty" name="jetty-util"
rev="6.1.22" conf="test->default" />
+ <!-- end of test artifacts -->
<!--global exclusion -->
<exclude module="jmxtools" />
<exclude module="jms" />
<exclude module="jmxri" />
- <exclude org="com.thoughtworks.xstream"/>
- <exclude org="org.apache.mrunit"/>
+ <exclude org="com.thoughtworks.xstream"/>
</dependencies>
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Tue Aug
18 21:19:07 2015
@@ -19,7 +19,6 @@ package org.apache.nutch.indexer;
// Hadoop imports
import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
// Nutch imports
Copied: nutch/trunk/src/plugin/index-geoip/build-ivy.xml (from r1693938,
nutch/trunk/src/plugin/parse-tika/build-ivy.xml)
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/build-ivy.xml?p2=nutch/trunk/src/plugin/index-geoip/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1693938&r2=1696506&rev=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original)
+++ nutch/trunk/src/plugin/index-geoip/build-ivy.xml Tue Aug 18 21:19:07 2015
@@ -15,7 +15,7 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
-<project name="parse-tika" default="deps-jar"
xmlns:ivy="antlib:org.apache.ivy.ant">
+<project name="index-geoip" default="deps-jar"
xmlns:ivy="antlib:org.apache.ivy.ant">
<property name="ivy.install.version" value="2.1.0" />
<condition property="ivy.home" value="${env.IVY_HOME}">
Modified: nutch/trunk/src/plugin/index-geoip/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/ivy.xml (original)
+++ nutch/trunk/src/plugin/index-geoip/ivy.xml Tue Aug 18 21:19:07 2015
@@ -36,7 +36,11 @@
</publications>
<dependencies>
- <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.1.0" />
+ <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" >
+ <!-- Exclude due to classpath issues -->
+ <exclude org="org.apache.httpcomponents" name="httpclient" />
+ <exclude org="org.apache.httpcomponents" name="httpcore" />
+ </dependency>
</dependencies>
</ivy-module>
Modified: nutch/trunk/src/plugin/index-geoip/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/plugin.xml (original)
+++ nutch/trunk/src/plugin/index-geoip/plugin.xml Tue Aug 18 21:19:07 2015
@@ -25,15 +25,13 @@
<library name="index-geoip.jar">
<export name="*"/>
</library>
- <library name="geoip2-2.1.0.jar"/>
<library name="commons-codec-1.6.jar"/>
<library name="commons-logging-1.1.1.jar"/>
- <library name="google-http-client-1.19.0.jar"/>
- <library name="httpclient-4.0.1.jar"/>
- <library name="httpcore-4.0.1.jar"/>
- <library name="jackson-annotations-2.4.0.jar"/>
- <library name="jackson-core-2.4.3.jar"/>
- <library name="jackson-databind-2.4.3.jar"/>
+ <library name="geoip2-2.3.1.jar"/>
+ <library name="google-http-client-1.20.0.jar"/>
+ <library name="jackson-annotations-2.5.0.jar"/>
+ <library name="jackson-core-2.5.3.jar"/>
+ <library name="jackson-databind-2.5.3.jar"/>
<library name="jsr305-1.3.9.jar"/>
<library name="maxmind-db-1.0.0.jar"/>
</runtime>
Added: nutch/trunk/src/plugin/indexer-solr/build-ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/build-ivy.xml?rev=1696506&view=auto
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/indexer-solr/build-ivy.xml Tue Aug 18 21:19:07 2015
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-solr" default="deps-jar"
xmlns:ivy="antlib:org.apache.ivy.ant">
+
+ <property name="ivy.install.version" value="2.1.0" />
+ <condition property="ivy.home" value="${env.IVY_HOME}">
+ <isset property="env.IVY_HOME" />
+ </condition>
+ <property name="ivy.home" value="${user.home}/.ant" />
+ <property name="ivy.checksums" value="" />
+ <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+ <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+ <target name="download-ivy" unless="offline">
+
+ <mkdir dir="${ivy.jar.dir}"/>
+ <!-- download Ivy from web site so that it can be used even without any special installation -->
+ <get
src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+ dest="${ivy.jar.file}" usetimestamp="true"/>
+ </target>
+
+ <target name="init-ivy" depends="download-ivy">
+ <!-- try to load ivy here from ivy home, in case the user has not already dropped
+ it into ant's lib dir (note that the latter copy will always take precedence).
+ We will not fail as long as local lib dir exists (it may be empty) and
+ ivy is in at least one of ant's lib dir or the local lib dir. -->
+ <path id="ivy.lib.path">
+ <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+ </path>
+ <taskdef resource="org/apache/ivy/ant/antlib.xml"
+ uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+ </target>
+
+ <target name="deps-jar" depends="init-ivy">
+ <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+ </target>
+
+</project>
Modified: nutch/trunk/src/plugin/indexer-solr/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/ivy.xml (original)
+++ nutch/trunk/src/plugin/indexer-solr/ivy.xml Tue Aug 18 21:19:07 2015
@@ -36,8 +36,9 @@
</publications>
<dependencies>
- <dependency org="org.apache.solr" name="solr-solrj" rev="3.4.0"
- conf="*->default"/>
+ <dependency org="org.apache.solr" name="solr-solrj" rev="4.10.2"
conf="*->default"/>
+ <dependency org="org.apache.httpcomponents" name="httpclient" rev="4.3.1"
conf="*->default"/>
+ <dependency org="org.apache.httpcomponents" name="httpmime" rev="4.3.1"
conf="*->default"/>
</dependencies>
</ivy-module>
Modified: nutch/trunk/src/plugin/indexer-solr/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/plugin.xml (original)
+++ nutch/trunk/src/plugin/indexer-solr/plugin.xml Tue Aug 18 21:19:07 2015
@@ -15,29 +15,24 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
-<plugin id="indexer-solr" name="SOLRIndexWriter" version="1.0.0"
+<plugin id="indexer-solr" name="SolrIndexWriter" version="1.0.0"
provider-name="nutch.apache.org">
<runtime>
<library name="indexer-solr.jar">
<export name="*" />
</library>
-
- <library name="activation-1.1.jar"/>
- <library name="commons-codec-1.4.jar"/>
- <library name="commons-httpclient-3.1.jar"/>
- <library name="commons-io-1.4.jar"/>
- <library name="commons-logging-1.1.1.jar"/>
- <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
- <library name="jline-0.9.1.jar"/>
- <library name="log4j-1.2.15.jar"/>
- <library name="lucene-core-3.4.0.jar"/>
- <library name="mail-1.4.1.jar"/>
- <library name="slf4j-api-1.6.1.jar"/>
- <library name="solr-solrj-3.4.0.jar"/>
- <library name="stax-api-1.0.1.jar"/>
- <library name="wstx-asl-3.2.7.jar"/>
- <library name="zookeeper-3.3.1.jar"/>
+ <library name="commons-codec-1.9.jar"/>
+ <library name="commons-io-2.3.jar"/>
+ <library name="commons-logging-1.1.3.jar"/>
+ <library name="httpclient-4.3.1.jar"/>
+ <library name="httpcore-4.3.jar"/>
+ <library name="httpmime-4.3.1.jar"/>
+ <library name="noggit-0.5.jar"/>
+ <library name="slf4j-api-1.7.6.jar"/>
+ <library name="solr-solrj-4.10.2.jar"/>
+ <library name="wstx-asl-3.2.7.jar"/>
+ <library name="zookeeper-3.4.6.jar"/>
</runtime>
<requires>
@@ -45,9 +40,9 @@
</requires>
<extension id="org.apache.nutch.indexer.solr"
- name="SOLR Index Writer"
+ name="Solr Index Writer"
point="org.apache.nutch.indexer.IndexWriter">
- <implementation id="SOLRIndexWriter"
+ <implementation id="SolrIndexWriter"
class="org.apache.nutch.indexwriter.solr.SolrIndexWriter" />
</extension>
Modified:
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
---
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
(original)
+++
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
Tue Aug 18 21:19:07 2015
@@ -17,6 +17,7 @@
package org.apache.nutch.indexwriter.solr;
public interface SolrConstants {
+
public static final String SOLR_PREFIX = "solr.";
public static final String SERVER_URL = SOLR_PREFIX + "server.url";
@@ -30,7 +31,13 @@ public interface SolrConstants {
public static final String USERNAME = SOLR_PREFIX + "auth.username";
public static final String PASSWORD = SOLR_PREFIX + "auth.password";
-
+
+ public static final String SERVER_TYPE = SOLR_PREFIX + "server.type";
+
+ public static final String ZOOKEEPER_URL = SOLR_PREFIX + "zookeeper.url";
+
+ public static final String LOADBALANCE_URLS = SOLR_PREFIX +
"loadbalance.urls";
+
@Deprecated
public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index";
Modified:
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
---
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
(original)
+++
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
Tue Aug 18 21:19:07 2015
@@ -55,7 +55,7 @@ public class SolrIndexWriter implements
private boolean delete = false;
public void open(JobConf job, String name) throws IOException {
- SolrServer server = SolrUtils.getCommonsHttpSolrServer(job);
+ SolrServer server = SolrUtils.getSolrServer(job);
init(server, job);
}
@@ -183,7 +183,7 @@ public class SolrIndexWriter implements
config = conf;
String serverURL = conf.get(SolrConstants.SERVER_URL);
if (serverURL == null) {
- String message = "Missing SOLR URL. Should be set via -D "
+ String message = "Missing Solr URL. Should be set via -D "
+ SolrConstants.SERVER_URL;
message += "\n" + describe();
LOG.error(message);
@@ -192,15 +192,20 @@ public class SolrIndexWriter implements
}
public String describe() {
- StringBuffer sb = new StringBuffer("SOLRIndexWriter\n");
+ StringBuffer sb = new StringBuffer("SolrIndexWriter\n");
+ sb.append("\t").append(SolrConstants.SERVER_TYPE)
+ .append(" : Type of SolrServer to communicate with (default 'http'
however options include 'cloud', 'lb' and 'concurrent')\n");
sb.append("\t").append(SolrConstants.SERVER_URL)
- .append(" : URL of the SOLR instance (mandatory)\n");
- sb.append("\t").append(SolrConstants.COMMIT_SIZE)
- .append(" : buffer size when sending to SOLR (default 1000)\n");
+ .append(" : URL of the Solr instance (mandatory)\n");
+ sb.append("\t").append(SolrConstants.ZOOKEEPER_URL)
+ .append(" : URL of the Zookeeper URL (mandatory if 'cloud' value for
solr.server.type)\n");
+ sb.append("\t").append(SolrConstants.LOADBALANCE_URLS)
+ .append(" : Comma-separated string of Solr server strings to be used
(madatory if 'lb' value for solr.server.type)\n");
sb.append("\t")
.append(SolrConstants.MAPPING_FILE)
- .append(
- " : name of the mapping file for fields (default
solrindex-mapping.xml)\n");
+ .append(" : name of the mapping file for fields (default
solrindex-mapping.xml)\n");
+ sb.append("\t").append(SolrConstants.COMMIT_SIZE)
+ .append(" : buffer size when sending to Solr (default 1000)\n");
sb.append("\t").append(SolrConstants.USE_AUTH)
.append(" : use authentication (default false)\n");
sb.append("\t").append(SolrConstants.USERNAME)
@@ -209,5 +214,4 @@ public class SolrIndexWriter implements
.append(" : password for authentication\n");
return sb.toString();
}
-
}
Modified:
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
---
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
(original)
+++
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
Tue Aug 18 21:19:07 2015
@@ -16,14 +16,20 @@
*/
package org.apache.nutch.indexwriter.solr;
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.auth.AuthScope;
-import org.apache.commons.httpclient.UsernamePasswordCredentials;
-import org.apache.commons.httpclient.params.HttpClientParams;
+import org.apache.http.impl.client.BasicCredentialsProvider;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.CredentialsProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.mapred.JobConf;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.solr.client.solrj.impl.CloudSolrServer;
+import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.client.solrj.impl.LBHttpSolrServer;
+import org.apache.solr.client.solrj.SolrServer;
import java.net.MalformedURLException;
@@ -31,33 +37,62 @@ public class SolrUtils {
public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
- public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job)
+ private static SolrServer server;
+
+ public static SolrServer getSolrServer(JobConf job)
throws MalformedURLException {
- HttpClient client = new HttpClient();
+ boolean auth = job.getBoolean(SolrConstants.USE_AUTH, false);
+
+ CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
// Check for username/password
- if (job.getBoolean(SolrConstants.USE_AUTH, false)) {
+ if (auth) {
String username = job.get(SolrConstants.USERNAME);
-
LOG.info("Authenticating as: " + username);
-
AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT,
AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
-
- client.getState().setCredentials(
- scope,
- new UsernamePasswordCredentials(username, job
- .get(SolrConstants.PASSWORD)));
-
- HttpClientParams params = client.getParams();
- params.setAuthenticationPreemptive(true);
-
- client.setParams(params);
+ credentialsProvider.setCredentials(scope,
+ new UsernamePasswordCredentials(username,
job.get(SolrConstants.PASSWORD)));
}
+ CloseableHttpClient client =
+
HttpClientBuilder.create().setDefaultCredentialsProvider(credentialsProvider).build();
- String serverURL = job.get(SolrConstants.SERVER_URL);
-
- return new CommonsHttpSolrServer(serverURL, client);
+ String solrServer = job.get(SolrConstants.SERVER_TYPE, "http");
+ String zkHost = job.get(SolrConstants.ZOOKEEPER_URL, null);
+ String solrServerUrl = job.get(SolrConstants.SERVER_URL);
+
+ switch (solrServer) {
+ case "cloud":
+ server = new CloudSolrServer(zkHost);
+ LOG.debug("CloudSolrServer selected as indexing server.");
+ break;
+ case "concurrent":
+ server = new ConcurrentUpdateSolrServer(solrServerUrl, client, 1000, 10);
+ LOG.debug("ConcurrentUpdateSolrServer selected as indexing server.");
+ break;
+ case "http":
+ if (auth) {
+ server = new HttpSolrServer(solrServerUrl, client);
+ } else {
+ server = new HttpSolrServer(solrServerUrl);
+ }
+ LOG.debug("HttpSolrServer selected as indexing server.");
+ break;
+ case "lb":
+ String[] lbServerString =
job.get(SolrConstants.LOADBALANCE_URLS).split(",");
+ server = new LBHttpSolrServer(client, lbServerString);
+ LOG.debug("LBHttpSolrServer selected as indexing server.");
+ break;
+ default:
+ if (auth) {
+ server = new HttpSolrServer(solrServerUrl, client);
+ } else {
+ server = new HttpSolrServer(solrServerUrl);
+ }
+ LOG.debug("HttpSolrServer selected as indexing server.");
+ break;
+ }
+ return server;
}
public static String stripNonCharCodepoints(String input) {
@@ -82,4 +117,4 @@ public class SolrUtils {
return retval.toString();
}
-}
+}
\ No newline at end of file
Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Aug 18 21:19:07 2015
@@ -38,6 +38,8 @@
<dependencies>
<dependency org="org.apache.tika" name="tika-parsers" rev="1.8"
conf="*->default">
<exclude org="org.apache.tika" name="tika-core" />
+ <exclude org="org.apache.httpcomponents" name="httpclient" />
+ <exclude org="org.apache.httpcomponents" name="httpcore" />
</dependency>
<override module="rome" rev="0.9"/>
</dependencies>
Copied: nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml (from
r1693468, nutch/trunk/src/plugin/parse-tika/build-ivy.xml)
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml?p2=nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1693468&r2=1696506&rev=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml Tue Aug 18
21:19:07 2015
@@ -15,7 +15,7 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
-<project name="parse-tika" default="deps-jar"
xmlns:ivy="antlib:org.apache.ivy.ant">
+<project name="parsefilter-naivebayes" default="deps-jar"
xmlns:ivy="antlib:org.apache.ivy.ant">
<property name="ivy.install.version" value="2.1.0" />
<condition property="ivy.home" value="${env.IVY_HOME}">
Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml Tue Aug 18 21:19:07
2015
@@ -36,6 +36,14 @@
</publications>
<dependencies>
+
+ <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" />
+ <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" >
+ <exclude org="org.apache.mrunit" name="mrunit"/>
+ </dependency>
+ <dependency org="org.apache.lucene" name="lucene-core" rev="4.10.2" />
+ <dependency org="org.apache.lucene" name="lucene-analyzers-common"
rev="4.10.2" />
+
</dependencies>
</ivy-module>
Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml Tue Aug 18
21:19:07 2015
@@ -25,10 +25,22 @@
<library name="parsefilter-naivebayes.jar">
<export name="*"/>
</library>
- <library name="lucene-analyzers-common-4.3.0.jar"/>
- <library name="mahout-math-0.8.jar"/>
- <library name="mahout-core-0.8.jar"/>
- <library name="lucene-core-4.3.0.jar"/>
+ <library name="commons-cli-2.0-mahout.jar"/>
+ <library name="commons-lang3-3.1.jar"/>
+ <library name="commons-math3-3.2.jar"/>
+ <library name="guava-14.0.1.jar"/>
+ <library name="jackson-core-asl-1.9.12.jar"/>
+ <library name="jackson-mapper-asl-1.9.12.jar"/>
+ <library name="lucene-analyzers-common-4.10.2.jar"/>
+ <library name="lucene-core-4.10.2.jar"/>
+ <library name="mahout-core-0.9.jar"/>
+ <library name="mahout-math-0.10.1.jar"/>
+ <library name="slf4j-api-1.7.12.jar"/>
+ <library name="solr-commons-csv-3.5.0.jar"/>
+ <library name="t-digest-3.1.jar"/>
+ <library name="xmlpull-1.1.3.1.jar"/>
+ <library name="xpp3_min-1.1.4c.jar"/>
+ <library name="xstream-1.4.4.jar"/>
</runtime>
<requires>
@@ -36,10 +48,9 @@
</requires>
<extension id="org.apache.nutch.htmlparsefilter.naivebayes"
- name="Nutch Parser Filter"
- point="org.apache.nutch.parse.HtmlParseFilter">
- <implementation id="NaiveBayesHTMLParseFilter"
-
class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
+ name="Nutch Parser Filter"
point="org.apache.nutch.parse.HtmlParseFilter">
+ <implementation id="NaiveBayesHTMLParseFilter"
+ class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
</extension>
</plugin>
Modified:
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
---
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
(original)
+++
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
Tue Aug 18 21:19:07 2015
@@ -35,7 +35,6 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
import org.apache.mahout.classifier.naivebayes.BayesUtils;
import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
@@ -47,17 +46,12 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
import org.apache.mahout.vectorizer.TFIDF;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
public class NaiveBayesClassifier {
private static NaiveBayesModel model = null;
- private static final Logger LOG = LoggerFactory
- .getLogger(NaiveBayesClassifier.class);
public static Map<String, Integer> readDictionnary(Configuration conf,
Path dictionnaryPath) {
@@ -114,7 +108,7 @@ public class NaiveBayesClassifier {
public static String classify(String text, String modelPath,
String labelIndexPath, String dictionaryPath, String
documentFrequencyPath)
- throws IOException {
+ throws IOException {
Configuration configuration = new Configuration();
@@ -134,7 +128,7 @@ public class NaiveBayesClassifier {
new Path(documentFrequencyPath));
// analyzer used to extract word from text
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
+ Analyzer analyzer = new StandardAnalyzer();
// int labelCount = labels.size();
int documentCount = documentFrequency.get(-1).intValue();