Author: snagel Date: Tue Jul 29 15:13:20 2014 New Revision: 1614375 URL: http://svn.apache.org/r1614375 Log: NUTCH-1708 use same id when indexing and deleting redirects
Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/schema.xml nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java nutch/trunk/CHANGES.txt nutch/trunk/conf/schema-solr4.xml nutch/trunk/conf/schema.xml nutch/trunk/conf/solrindex-mapping.xml nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Tue Jul 29 15:13:20 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1708 use same id when indexing and deleting redirects (snagel) + * NUTCH-1817 Remove pom.xml from source (jnioche) * NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel) Modified: nutch/branches/2.x/conf/schema.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/branches/2.x/conf/schema.xml (original) +++ nutch/branches/2.x/conf/schema.xml Tue Jul 29 15:13:20 2014 @@ -307,7 +307,7 @@ to include it as performance improvements are minimal. --> <field name="_version_" type="long" indexed="true" stored="true"/> - <field name="id" type="string" stored="true" indexed="true"/> + <field name="id" type="string" stored="true" indexed="true" required="true"/> <!-- core fields --> <field name="batchId" type="string" stored="true" indexed="false"/> @@ -316,7 +316,7 @@ <!-- fields for index-basic plugin --> <field name="host" type="url" stored="false" indexed="true"/> - <field name="url" type="url" stored="true" indexed="true" required="true"/> + <field name="url" type="url" stored="true" indexed="true"/> <field name="orig" type="url" stored="true" indexed="true" /> <!-- stored=true for highlighting, use term vectors and positions for fast highlighting --> <field name="content" type="text_general" stored="true" indexed="true"/> Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Jul 29 15:13:20 2014 @@ -123,6 +123,7 @@ public class IndexingFiltersChecker exte } NutchDocument doc = new NutchDocument(); + doc.add("id", url); doc.add("digest", StringUtil.toHexString(page.getSignature())); try { Modified: nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java (original) +++ nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Tue Jul 29 15:13:20 2014 @@ -121,7 +121,7 @@ public class ElasticIndexWriter implemen @Override public void write(NutchDocument doc) throws IOException { - String id = (String) doc.getFieldValue("url"); + String id = (String) doc.getFieldValue("id"); String type = doc.getDocumentMeta().get("type"); if (type == null) type = "doc"; Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jul 29 15:13:20 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1708 use same id when indexing and deleting redirects (snagel) + * NUTCH-1818 Add deps-test-compile task for building plugins (jnioche) * NUTCH-1817 Remove pom.xml from source (jnioche) Modified: nutch/trunk/conf/schema-solr4.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/trunk/conf/schema-solr4.xml (original) +++ nutch/trunk/conf/schema-solr4.xml Tue Jul 29 15:13:20 2014 @@ -301,7 +301,7 @@ </types> <fields> - <field name="id" type="string" stored="true" indexed="true"/> + <field name="id" type="string" stored="true" indexed="true" required="true"/> <!-- core fields --> <field name="segment" type="string" stored="true" indexed="false"/> @@ -310,7 +310,7 @@ <!-- fields for index-basic plugin --> <field name="host" type="url" stored="false" indexed="true"/> - <field name="url" type="url" stored="true" indexed="true" required="true"/> + <field name="url" type="url" stored="true" indexed="true"/> <!-- stored=true for highlighting, use term vectors and positions for fast highlighting --> <field name="content" type="text_general" stored="true" indexed="true"/> <field name="title" type="text_general" stored="true" indexed="true"/> Modified: nutch/trunk/conf/schema.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/trunk/conf/schema.xml (original) +++ nutch/trunk/conf/schema.xml Tue Jul 29 15:13:20 2014 @@ -66,7 +66,8 @@ </fieldType> </types> <fields> - <field name="id" type="string" stored="true" indexed="true"/> + <field name="id" type="string" stored="true" indexed="true" + required="true"/> <!-- core fields --> <field name="segment" type="string" stored="true" indexed="false"/> @@ -75,8 +76,7 @@ <!-- fields for index-basic plugin --> <field name="host" type="string" stored="false" indexed="true"/> - <field name="url" type="url" stored="true" indexed="true" - required="true"/> + <field name="url" type="url" stored="true" indexed="true"/> <field name="content" type="text" stored="false" indexed="true"/> <field name="title" type="text" stored="true" indexed="true"/> <field name="cache" type="string" stored="true" indexed="false"/> Modified: nutch/trunk/conf/solrindex-mapping.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/solrindex-mapping.xml?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/trunk/conf/solrindex-mapping.xml (original) +++ nutch/trunk/conf/solrindex-mapping.xml Tue Jul 29 15:13:20 2014 @@ -38,8 +38,6 @@ <field dest="boost" source="boost"/> <field dest="digest" source="digest"/> <field dest="tstamp" source="tstamp"/> - <field dest="id" source="url"/> - <copyField source="url" dest="url"/> </fields> <uniqueKey>id</uniqueKey> </mapping> Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Jul 29 15:13:20 2014 @@ -262,6 +262,8 @@ implements Mapper<Text, Writable, Text, } NutchDocument doc = new NutchDocument(); + doc.add("id", key.toString()); + final Metadata metadata = parseData.getContentMeta(); // add segment, used to map from merged index back to segment files Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Jul 29 15:13:20 2014 @@ -130,6 +130,7 @@ public class IndexingFiltersChecker exte ParseResult parseResult = new ParseUtil(conf).parse(content); NutchDocument doc = new NutchDocument(); + doc.add("id", url); Text urlText = new Text(url); Inlinks inlinks = null; Modified: nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java (original) +++ nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java Tue Jul 29 15:13:20 2014 @@ -53,12 +53,12 @@ public class DummyIndexWriter implements @Override public void update(NutchDocument doc) throws IOException { - writer.write("update\t" + doc.getFieldValue("url") + "\n"); + writer.write("update\t" + doc.getFieldValue("id") + "\n"); } @Override public void write(NutchDocument doc) throws IOException { - writer.write("add\t" + doc.getFieldValue("url") + "\n"); + writer.write("add\t" + doc.getFieldValue("id") + "\n"); } public void close() throws IOException { Modified: nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java (original) +++ nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Tue Jul 29 15:13:20 2014 @@ -123,7 +123,7 @@ public class ElasticIndexWriter implemen @Override public void write(NutchDocument doc) throws IOException { - String id = (String) doc.getFieldValue("url"); + String id = (String) doc.getFieldValue("id"); String type = doc.getDocumentMeta().get("type"); if (type == null) type = "doc";