Author: fenglu
Date: Tue Mar 26 14:09:27 2013
New Revision: 1461140

URL: http://svn.apache.org/r1461140
Log:
NUTCH-1532 Replace 'segment' mapping field with batchId

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/schema-solr4.xml
    nutch/branches/2.x/conf/schema.xml
    nutch/branches/2.x/conf/solrindex-mapping.xml
    nutch/branches/2.x/src/bin/nutch
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
    nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
    nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java
    nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
    nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
    nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
    nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Mar 26 14:09:27 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1532 Replace 'segment' mapping field with batchId (Feng via lewismc)
+
 * NUTCH-1533 Implement getPrevModifiedTime(), setPrevModifiedTime(), getBatchId() and setBatchId() accessors in o.a.n.storage.WebPage (Feng via lewismc)
 
 * NUTCH-XX fix Elastic Search Ivy configuration (Binoy d via lewismc) 

Modified: nutch/branches/2.x/conf/schema-solr4.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema-solr4.xml (original)
+++ nutch/branches/2.x/conf/schema-solr4.xml Tue Mar 26 14:09:27 2013
@@ -304,7 +304,7 @@
     <field name="id" type="string" stored="true" indexed="true"/>
 
     <!-- core fields -->
-    <field name="segment" type="string" stored="true" indexed="false"/>
+    <field name="batchId" type="string" stored="true" indexed="false"/>
     <field name="digest" type="string" stored="true" indexed="false"/>
     <field name="boost" type="float" stored="true" indexed="false"/>
 

Modified: nutch/branches/2.x/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Tue Mar 26 14:09:27 2013
@@ -69,7 +69,7 @@
         <field name="id" type="string" stored="true" indexed="true"/>
 
         <!-- core fields -->
-        <field name="segment" type="string" stored="true" indexed="false"/>
+        <field name="batchId" type="string" stored="true" indexed="false"/>
         <field name="digest" type="string" stored="true" indexed="false"/>
         <field name="boost" type="float" stored="true" indexed="false"/>
 

Modified: nutch/branches/2.x/conf/solrindex-mapping.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/solrindex-mapping.xml?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/conf/solrindex-mapping.xml (original)
+++ nutch/branches/2.x/conf/solrindex-mapping.xml Tue Mar 26 14:09:27 2013
@@ -34,7 +34,7 @@
                <field dest="content" source="content"/>
                <field dest="title" source="title"/>
                <field dest="host" source="host"/>
-               <field dest="segment" source="segment"/>
+               <field dest="batchId" source="batchId"/>
                <field dest="boost" source="boost"/>
                <field dest="digest" source="digest"/>
                <field dest="tstamp" source="tstamp"/>

Modified: nutch/branches/2.x/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/nutch (original)
+++ nutch/branches/2.x/src/bin/nutch Tue Mar 26 14:09:27 2013
@@ -50,7 +50,7 @@ if [ $# = 0 ]; then
 # echo " crawl one-step crawler for intranets"
   echo " inject                inject new urls into the database"
   echo " hostinject     creates or updates an existing host table from a text 
file"
-  echo " generate      generate new segments to fetch from crawl db"
+  echo " generate      generate new batches to fetch from crawl db"
   echo " fetch                 fetch URLs marked during generate"
   echo " parse                 parse URLs marked during fetch"
   echo " updatedb      update web table after parsing"
@@ -58,7 +58,7 @@ if [ $# = 0 ]; then
   echo " readdb        read/dump records from page database"
   echo " readhostdb     display entries from the hostDB"
   echo " elasticindex   run the elasticsearch indexer"
-  echo " solrindex     run the solr indexer on parsed segments and linkdb"
+  echo " solrindex     run the solr indexer on parsed batches"
   echo " solrdedup     remove duplicates from solr"
   echo " parsechecker   check the parser for a given url"
   echo " plugin        load a plugin and run one of its classes main()"

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java 
Tue Mar 26 14:09:27 2013
@@ -169,7 +169,7 @@ implements FetchSchedule {
   public boolean shouldFetch(String url, WebPage page, long curTime) {
     // pages are never truly GONE - we have to check them from time to time.
     // pages with too long fetchInterval are adjusted so that they fit within
-    // maximum fetchInterval (segment retention period).
+    // maximum fetchInterval (batch retention period).
     long fetchTime = page.getFetchTime();
     if (fetchTime - curTime > maxInterval * 1000L) {
       if (page.getFetchInterval() > maxInterval) {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java Tue Mar 
26 14:09:27 2013
@@ -43,17 +43,28 @@ public class IndexUtil {
   }
   
   /**
-   * Index a webpage.
+   * Index a {@link Webpage}, here we add the following fields:
+   * <ol>
+   * <li><tt>id</tt>: default uniqueKey for the {@link NutchDocument}.</li>
+   * <li><tt>digest</tt>: Digest is used to identify pages (like unique ID) 
and is used to remove
+   * duplicates during the dedup procedure. It is calculated using {@link 
org.apache.nutch.crawl.MD5Signature} or
+   * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+   * <li><tt>batchId</tt>: The page belongs to a unique batchId, this is its 
identifier.</li>
+   * <li><tt>boost</tt>: Boost is used to calculate document (field) score 
which can be used within
+   * queries submitted to the underlying indexing library to find the best 
results. It's part of the scoring algorithms. 
+   * See scoring.link, scoring.opic, scoring.tld, etc.</li>
+   * </ol>
    * 
    * @param key The key of the page (reversed url).
-   * @param page The webpage.
+   * @param page The {@link Webpage}.
    * @return The indexed document, or null if skipped by index filters.
    */
   public NutchDocument index(String key, WebPage page) {
     NutchDocument doc = new NutchDocument();
     doc.add("id", key);
     doc.add("digest", StringUtil.toHexString(page.getSignature().array()));
-
+    doc.add("batchId", page.getBatchId().toString());
+    
     String url = TableUtil.unreverseUrl(key);
 
     if (LOG.isDebugEnabled()) {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java Tue Mar 26 
14:09:27 2013
@@ -36,7 +36,7 @@ public interface Nutch {
 
   public static final String SIGNATURE_KEY = "nutch.content.digest";
 
-  public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+  public static final String BATCH_NAME_KEY = "nutch.batch.name";
 
   public static final String SCORE_KEY = "nutch.crawl.score";
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java Tue Mar 
26 14:09:27 2013
@@ -32,7 +32,6 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.ArrayFile;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.io.VersionMismatchException;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.GenericOptionsParser;
@@ -113,21 +112,21 @@ public final class Content implements Wr
     switch (oldVersion) {
     case 0:
     case 1:
-      url = UTF8.readString(in); // read url
-      base = UTF8.readString(in); // read base
+      url = Text.readString(in); // read url
+      base = Text.readString(in); // read base
 
       content = new byte[in.readInt()]; // read content
       in.readFully(content);
 
-      contentType = UTF8.readString(in); // read contentType
+      contentType = Text.readString(in); // read contentType
       // reconstruct metadata
       int keySize = in.readInt();
       String key;
       for (int i = 0; i < keySize; i++) {
-        key = UTF8.readString(in);
+        key = Text.readString(in);
         int valueSize = in.readInt();
         for (int j = 0; j < valueSize; j++) {
-          metadata.add(key, UTF8.readString(in));
+          metadata.add(key, Text.readString(in));
         }
       }
       break;
@@ -271,7 +270,7 @@ public final class Content implements Wr
 
   public static void main(String args[]) throws Exception {
 
-    String usage = "Content (-local | -dfs <namenode:port>) recno segment";
+    String usage = "Content (-local | -dfs <namenode:port>) recno batchId";
 
     if (args.length < 3) {
       System.out.println("usage:" + usage);
@@ -286,9 +285,9 @@ public final class Content implements Wr
     FileSystem fs = FileSystem.get(conf);
     try {
       int recno = Integer.parseInt(argv[0]);
-      String segment = argv[1];
+      String batchId = argv[1];
 
-      Path file = new Path(segment, DIR_NAME);
+      Path file = new Path(batchId, DIR_NAME);
       System.out.println("Reading from file: " + file);
 
       ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java Tue Mar 
26 14:09:27 2013
@@ -131,7 +131,7 @@ public class Benchmark extends Configure
       System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
       System.err.println("\t-threads NN\tuse NN threads per Fetcher task 
(default: 10)");
       // XXX what is the equivalent here? not an additional job...
-      // System.err.println("\t-keep\tkeep segment data (default: delete after 
updatedb)");
+      // System.err.println("\t-keep\tkeep batchId data (default: delete after 
updatedb)");
       System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
       System.err.println("\tNOTE: if not specified, this is reset to: " + 
plugins);
       System.err.println("\tNOTE: if 'default' is specified then a value set 
in nutch-default/nutch-site is used.");
@@ -201,7 +201,7 @@ public class Benchmark extends Configure
     InjectorJob injector = new InjectorJob(conf);
     GeneratorJob generator = new GeneratorJob(conf);
     FetcherJob fetcher = new FetcherJob(conf);
-    ParserJob parseSegment = new ParserJob(conf);
+    ParserJob parseBatch = new ParserJob(conf);
     DbUpdaterJob crawlDbTool = new DbUpdaterJob(conf);
     // not needed in the new API
     //LinkDb linkDbTool = new LinkDb(getConf());
@@ -212,7 +212,7 @@ public class Benchmark extends Configure
     long delta = System.currentTimeMillis() - start;
     res.addTiming("inject", "0", delta);
     int i;
-    for (i = 0; i < depth; i++) {             // generate new segment
+    for (i = 0; i < depth; i++) {             // generate new batch
       start = System.currentTimeMillis();
       String batchId = generator.generate(topN, System.currentTimeMillis(),
               false, false);
@@ -229,7 +229,7 @@ public class Benchmark extends Configure
       res.addTiming("fetch", i + "", delta);
       if (!isParsing) {
         start = System.currentTimeMillis();
-        parseSegment.parse(batchId, false, false);    // parse it, if needed
+        parseBatch.parse(batchId, false, false);    // parse it, if needed
         delta = System.currentTimeMillis() - start;
         res.addTiming("parse", i + "", delta);
       }

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java 
Tue Mar 26 14:09:27 2013
@@ -32,19 +32,10 @@ package org.apache.nutch.tools.proxy;
  * limitations under the License.
  */
 
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Iterator;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.tools.proxy.FakeHandler.Mode;
-import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.mortbay.jetty.Handler;
 import org.mortbay.jetty.Server;
@@ -85,7 +76,7 @@ public class TestbedProxy {
     }
     
     Configuration conf = NutchConfiguration.create();
-    int port = conf.getInt("segment.proxy.port", 8181);
+    int port = conf.getInt("batch.proxy.port", 8181);
     boolean forward = false;
     boolean fake = false;
     boolean delay = false;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Tue Mar 26 
14:09:27 2013
@@ -126,7 +126,7 @@ public class URLUtil {
   }
 
   /** Partitions of the hostname of the url by "."  */
-  public static String[] getHostSegments(URL url) {
+  public static String[] getHostBatches(URL url) {
     String host = url.getHost();
     //return whole hostname, if it is an ipv4
     //TODO : handle ipv6
@@ -137,8 +137,8 @@ public class URLUtil {
 
   /** Partitions of the hostname of the url by "."
    * @throws MalformedURLException */
-  public static String[] getHostSegments(String url) throws 
MalformedURLException {
-   return getHostSegments(new URL(url));
+  public static String[] getHostBatches(String url) throws 
MalformedURLException {
+   return getHostBatches(new URL(url));
   }
 
   /**

Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java 
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Tue 
Mar 26 14:09:27 2013
@@ -247,12 +247,12 @@ public class TestGenerator extends Abstr
    *          number of results to generate
    * @param config
    *          Configuration to use
-   * @return path to generated segment
+   * @return path to generated batch
    * @throws IOException
    */
   private void generateFetchlist(int numResults, Configuration config,
       boolean filter) throws Exception {
-    // generate segment
+    // generate batch
     GeneratorJob g = new GeneratorJob();
     g.setConf(config);
     String batchId = g.generate(numResults, System.currentTimeMillis(), 
filter, false);

Modified: 
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
 (original)
+++ 
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
 Tue Mar 26 14:09:27 2013
@@ -296,7 +296,7 @@ public class TestSpellCheckedMetadata ex
     scmd.add("Accept-Ranges", "bytes");
     scmd.add("ETag", "\"1234567-89-01234567\"");
     scmd.add("Content-Length", "123");
-    scmd.add(Nutch.SEGMENT_NAME_KEY, "segmentzzz");
+    scmd.add(Nutch.BATCH_NAME_KEY, "batchzzz");
     scmd.add(Nutch.SIGNATURE_KEY, "123");
     return scmd;
   }

Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java 
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Tue Mar 
26 14:09:27 2013
@@ -36,7 +36,7 @@ public class TestURLUtil
 
     URL url = null;
 
-    url = new URL("http://lucene.apache.org/nutch");
+    url = new URL("http://nutch.apache.org");
     assertEquals("apache.org", URLUtil.getDomainName(url));
 
     url = new URL("http://en.wikipedia.org/wiki/Java_coffee");
@@ -133,35 +133,35 @@ public class TestURLUtil
 
   }
 
-  public void testGetHostSegments()
+  public void testGetHostBatches()
     throws Exception {
     URL url;
-    String[] segments;
+    String[] batches;
 
     url = new URL("http://subdomain.example.edu.tr");
-    segments = URLUtil.getHostSegments(url);
-    assertEquals("subdomain", segments[0]);
-    assertEquals("example", segments[1]);
-    assertEquals("edu", segments[2]);
-    assertEquals("tr", segments[3]);
+    batches = URLUtil.getHostBatches(url);
+    assertEquals("subdomain", batches[0]);
+    assertEquals("example", batches[1]);
+    assertEquals("edu", batches[2]);
+    assertEquals("tr", batches[3]);
 
     url = new URL("http://");
-    segments = URLUtil.getHostSegments(url);
-    assertEquals(1, segments.length);
-    assertEquals("", segments[0]);
+    batches = URLUtil.getHostBatches(url);
+    assertEquals(1, batches.length);
+    assertEquals("", batches[0]);
 
     url = new URL("http://140.211.11.130/foundation/contributing.html");
-    segments = URLUtil.getHostSegments(url);
-    assertEquals(1, segments.length);
-    assertEquals("140.211.11.130", segments[0]);
+    batches = URLUtil.getHostBatches(url);
+    assertEquals(1, batches.length);
+    assertEquals("140.211.11.130", batches[0]);
 
     // test non-ascii
     url = new URL("http://www.example.商業.tw");
-    segments = URLUtil.getHostSegments(url);
-    assertEquals("www", segments[0]);
-    assertEquals("example", segments[1]);
-    assertEquals("商業", segments[2]);
-    assertEquals("tw", segments[3]);
+    batches = URLUtil.getHostBatches(url);
+    assertEquals("www", batches[0]);
+    assertEquals("example", batches[1]);
+    assertEquals("商業", batches[2]);
+    assertEquals("tw", batches[3]);
 
   }
 


Reply via email to