Author: lewismc
Date: Sat Nov  1 18:44:26 2014
New Revision: 1636010

URL: http://svn.apache.org/r1636010
Log:
remove field orig which duplicates 'id'

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/schema.xml
    nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
    nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov  1 18:44:26 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1820 remove field "orig" which duplicates "id" (lewismc, snagel)
+
 * NUTCH-1843 Upgrade to Gora 0.5 (talat, lewismc, Kiril Menshikov, drazzib)
 
 * NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)

Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Sat Nov  1 18:44:26 2014
@@ -317,7 +317,6 @@
     <!-- fields for index-basic plugin -->
     <field name="host" type="url" stored="false" indexed="true"/>
     <field name="url" type="url" stored="true" indexed="true"/>
-    <field name="orig" type="url" stored="true" indexed="true" />
     <!-- stored=true for highlighting, use term vectors  and positions for fast highlighting -->
     <field name="content" type="text_general" stored="true" indexed="true"/>
     <field name="title" type="text_general" stored="true" indexed="true" multiValued="true"/>

Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Sat Nov  1 18:44:26 2014
@@ -40,7 +40,6 @@ import java.util.HashSet;
  * host - add host as un-stored, indexed and tokenized
  * url - url is both stored and indexed, so it's both searchable and returned. 
  * This is also a required field.
- * orig - also store original url as both stored and indexed
  * content - content is indexed, so that it's searchable, but not stored in index
  * title - title is stored and indexed
  * cache - add cached content/summary display policy, if available
@@ -99,11 +98,6 @@ public class BasicIndexingFilter impleme
     // url is both stored and indexed, so it's both searchable and returned
     doc.add("url", reprUrl == null ? url : reprUrl);
 
-    if (reprUrl != null) {
-      // also store original url as both stored and indexed
-      doc.add("orig", url);
-    }
-
     // content is indexed, so that it's searchable, but not stored in index
     doc.add("content", TableUtil.toString(page.getText()));
 

Modified: nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Sat Nov  1 18:44:26 2014
@@ -30,7 +30,7 @@ import static org.junit.Assert.*;
 
 /**
  * JUnit test case which tests
- * 1. that the host, url, orig, content, title, cache and tstamp fields 
+ * 1. that the host, url, content, title, cache and tstamp fields 
  * are obtained by the filter.
  * 2. that configurable maximum length functionality for titles actually works. .
  * This property defaults at 100 characters @see {@code indexer.max.title.length} 
@@ -65,7 +65,6 @@ public class TestBasicIndexingFilter {
        assertNotNull(doc);
        assertTrue("check for host field ", doc.getFieldNames().contains("host"));
        assertTrue("check for url field", doc.getFieldNames().contains("url"));
-       assertTrue("check for orig field", doc.getFieldNames().contains("orig"));
        assertTrue("check for content field", doc.getFieldNames().contains("content"));
        assertTrue("check for title field", doc.getFieldNames().contains("title"));
        assertTrue("check for cache field", doc.getFieldNames().contains("cache"));


Reply via email to