Author: fenglu
Date: Tue Mar 26 14:09:27 2013
New Revision: 1461140
URL: http://svn.apache.org/r1461140
Log:
NUTCH-1532 Replace 'segment' mapping field with batchId
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/schema-solr4.xml
nutch/branches/2.x/conf/schema.xml
nutch/branches/2.x/conf/solrindex-mapping.xml
nutch/branches/2.x/src/bin/nutch
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java
nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Mar 26 14:09:27 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1532 Replace 'segment' mapping field with batchId (Feng via lewismc)
+
* NUTCH-1533 Implement getPrevModifiedTime(), setPrevModifiedTime(),
getBatchId() and setBatchId() accessors in o.a.n.storage.WebPage (Feng via
lewismc)
* NUTCH-XX fix Elastic Search Ivy configuration (Binoy d via lewismc)
Modified: nutch/branches/2.x/conf/schema-solr4.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema-solr4.xml (original)
+++ nutch/branches/2.x/conf/schema-solr4.xml Tue Mar 26 14:09:27 2013
@@ -304,7 +304,7 @@
<field name="id" type="string" stored="true" indexed="true"/>
<!-- core fields -->
- <field name="segment" type="string" stored="true" indexed="false"/>
+ <field name="batchId" type="string" stored="true" indexed="false"/>
<field name="digest" type="string" stored="true" indexed="false"/>
<field name="boost" type="float" stored="true" indexed="false"/>
Modified: nutch/branches/2.x/conf/schema.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Tue Mar 26 14:09:27 2013
@@ -69,7 +69,7 @@
<field name="id" type="string" stored="true" indexed="true"/>
<!-- core fields -->
- <field name="segment" type="string" stored="true" indexed="false"/>
+ <field name="batchId" type="string" stored="true" indexed="false"/>
<field name="digest" type="string" stored="true" indexed="false"/>
<field name="boost" type="float" stored="true" indexed="false"/>
Modified: nutch/branches/2.x/conf/solrindex-mapping.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/solrindex-mapping.xml?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/conf/solrindex-mapping.xml (original)
+++ nutch/branches/2.x/conf/solrindex-mapping.xml Tue Mar 26 14:09:27 2013
@@ -34,7 +34,7 @@
<field dest="content" source="content"/>
<field dest="title" source="title"/>
<field dest="host" source="host"/>
- <field dest="segment" source="segment"/>
+ <field dest="batchId" source="batchId"/>
<field dest="boost" source="boost"/>
<field dest="digest" source="digest"/>
<field dest="tstamp" source="tstamp"/>
Modified: nutch/branches/2.x/src/bin/nutch
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/nutch (original)
+++ nutch/branches/2.x/src/bin/nutch Tue Mar 26 14:09:27 2013
@@ -50,7 +50,7 @@ if [ $# = 0 ]; then
# echo " crawl one-step crawler for intranets"
echo " inject inject new urls into the database"
echo " hostinject creates or updates an existing host table from a text
file"
- echo " generate generate new segments to fetch from crawl db"
+ echo " generate generate new batches to fetch from crawl db"
echo " fetch fetch URLs marked during generate"
echo " parse parse URLs marked during fetch"
echo " updatedb update web table after parsing"
@@ -58,7 +58,7 @@ if [ $# = 0 ]; then
echo " readdb read/dump records from page database"
echo " readhostdb display entries from the hostDB"
echo " elasticindex run the elasticsearch indexer"
- echo " solrindex run the solr indexer on parsed segments and linkdb"
+ echo " solrindex run the solr indexer on parsed batches"
echo " solrdedup remove duplicates from solr"
echo " parsechecker check the parser for a given url"
echo " plugin load a plugin and run one of its classes main()"
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
Tue Mar 26 14:09:27 2013
@@ -169,7 +169,7 @@ implements FetchSchedule {
public boolean shouldFetch(String url, WebPage page, long curTime) {
// pages are never truly GONE - we have to check them from time to time.
// pages with too long fetchInterval are adjusted so that they fit within
- // maximum fetchInterval (segment retention period).
+ // maximum fetchInterval (batch retention period).
long fetchTime = page.getFetchTime();
if (fetchTime - curTime > maxInterval * 1000L) {
if (page.getFetchInterval() > maxInterval) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java Tue Mar
26 14:09:27 2013
@@ -43,17 +43,28 @@ public class IndexUtil {
}
/**
- * Index a webpage.
+ * Index a {@link WebPage}, here we add the following fields:
+ * <ol>
+ * <li><tt>id</tt>: default uniqueKey for the {@link NutchDocument}.</li>
+ * <li><tt>digest</tt>: Digest is used to identify pages (like unique ID)
and is used to remove
+ * duplicates during the dedup procedure. It is calculated using {@link
org.apache.nutch.crawl.MD5Signature} or
+ * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+ * <li><tt>batchId</tt>: Identifier of the unique batch the page belongs to.</li>
+ * <li><tt>boost</tt>: Boost is used to calculate document (field) score
which can be used within
+ * queries submitted to the underlying indexing library to find the best
results. It's part of the scoring algorithms.
+ * See scoring.link, scoring.opic, scoring.tld, etc.</li>
+ * </ol>
*
* @param key The key of the page (reversed url).
- * @param page The webpage.
+ * @param page The {@link WebPage}.
* @return The indexed document, or null if skipped by index filters.
*/
public NutchDocument index(String key, WebPage page) {
NutchDocument doc = new NutchDocument();
doc.add("id", key);
doc.add("digest", StringUtil.toHexString(page.getSignature().array()));
-
+ doc.add("batchId", page.getBatchId().toString());
+
String url = TableUtil.unreverseUrl(key);
if (LOG.isDebugEnabled()) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java Tue Mar 26
14:09:27 2013
@@ -36,7 +36,7 @@ public interface Nutch {
public static final String SIGNATURE_KEY = "nutch.content.digest";
- public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+ public static final String BATCH_NAME_KEY = "nutch.batch.name";
public static final String SCORE_KEY = "nutch.crawl.score";
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java Tue Mar
26 14:09:27 2013
@@ -32,7 +32,6 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayFile;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.GenericOptionsParser;
@@ -113,21 +112,21 @@ public final class Content implements Wr
switch (oldVersion) {
case 0:
case 1:
- url = UTF8.readString(in); // read url
- base = UTF8.readString(in); // read base
+ url = Text.readString(in); // read url
+ base = Text.readString(in); // read base
content = new byte[in.readInt()]; // read content
in.readFully(content);
- contentType = UTF8.readString(in); // read contentType
+ contentType = Text.readString(in); // read contentType
// reconstruct metadata
int keySize = in.readInt();
String key;
for (int i = 0; i < keySize; i++) {
- key = UTF8.readString(in);
+ key = Text.readString(in);
int valueSize = in.readInt();
for (int j = 0; j < valueSize; j++) {
- metadata.add(key, UTF8.readString(in));
+ metadata.add(key, Text.readString(in));
}
}
break;
@@ -271,7 +270,7 @@ public final class Content implements Wr
public static void main(String args[]) throws Exception {
- String usage = "Content (-local | -dfs <namenode:port>) recno segment";
+ String usage = "Content (-local | -dfs <namenode:port>) recno batchId";
if (args.length < 3) {
System.out.println("usage:" + usage);
@@ -286,9 +285,9 @@ public final class Content implements Wr
FileSystem fs = FileSystem.get(conf);
try {
int recno = Integer.parseInt(argv[0]);
- String segment = argv[1];
+ String batchId = argv[1];
- Path file = new Path(segment, DIR_NAME);
+ Path file = new Path(batchId, DIR_NAME);
System.out.println("Reading from file: " + file);
ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java Tue Mar
26 14:09:27 2013
@@ -131,7 +131,7 @@ public class Benchmark extends Configure
System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
System.err.println("\t-threads NN\tuse NN threads per Fetcher task
(default: 10)");
// XXX what is the equivalent here? not an additional job...
- // System.err.println("\t-keep\tkeep segment data (default: delete after
updatedb)");
+ // System.err.println("\t-keep\tkeep batchId data (default: delete after
updatedb)");
System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
System.err.println("\tNOTE: if not specified, this is reset to: " +
plugins);
System.err.println("\tNOTE: if 'default' is specified then a value set
in nutch-default/nutch-site is used.");
@@ -201,7 +201,7 @@ public class Benchmark extends Configure
InjectorJob injector = new InjectorJob(conf);
GeneratorJob generator = new GeneratorJob(conf);
FetcherJob fetcher = new FetcherJob(conf);
- ParserJob parseSegment = new ParserJob(conf);
+ ParserJob parseBatch = new ParserJob(conf);
DbUpdaterJob crawlDbTool = new DbUpdaterJob(conf);
// not needed in the new API
//LinkDb linkDbTool = new LinkDb(getConf());
@@ -212,7 +212,7 @@ public class Benchmark extends Configure
long delta = System.currentTimeMillis() - start;
res.addTiming("inject", "0", delta);
int i;
- for (i = 0; i < depth; i++) { // generate new segment
+ for (i = 0; i < depth; i++) { // generate new batch
start = System.currentTimeMillis();
String batchId = generator.generate(topN, System.currentTimeMillis(),
false, false);
@@ -229,7 +229,7 @@ public class Benchmark extends Configure
res.addTiming("fetch", i + "", delta);
if (!isParsing) {
start = System.currentTimeMillis();
- parseSegment.parse(batchId, false, false); // parse it, if needed
+ parseBatch.parse(batchId, false, false); // parse it, if needed
delta = System.currentTimeMillis() - start;
res.addTiming("parse", i + "", delta);
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java
Tue Mar 26 14:09:27 2013
@@ -32,19 +32,10 @@ package org.apache.nutch.tools.proxy;
* limitations under the License.
*/
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Iterator;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.tools.proxy.FakeHandler.Mode;
-import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.mortbay.jetty.Handler;
import org.mortbay.jetty.Server;
@@ -85,7 +76,7 @@ public class TestbedProxy {
}
Configuration conf = NutchConfiguration.create();
- int port = conf.getInt("segment.proxy.port", 8181);
+ int port = conf.getInt("batch.proxy.port", 8181);
boolean forward = false;
boolean fake = false;
boolean delay = false;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Tue Mar 26
14:09:27 2013
@@ -126,7 +126,7 @@ public class URLUtil {
}
/** Partitions of the hostname of the url by "." */
- public static String[] getHostSegments(URL url) {
+ public static String[] getHostBatches(URL url) {
String host = url.getHost();
//return whole hostname, if it is an ipv4
//TODO : handle ipv6
@@ -137,8 +137,8 @@ public class URLUtil {
/** Partitions of the hostname of the url by "."
* @throws MalformedURLException */
- public static String[] getHostSegments(String url) throws
MalformedURLException {
- return getHostSegments(new URL(url));
+ public static String[] getHostBatches(String url) throws
MalformedURLException {
+ return getHostBatches(new URL(url));
}
/**
Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Tue
Mar 26 14:09:27 2013
@@ -247,12 +247,12 @@ public class TestGenerator extends Abstr
* number of results to generate
* @param config
* Configuration to use
- * @return path to generated segment
+ * @return path to generated batch
* @throws IOException
*/
private void generateFetchlist(int numResults, Configuration config,
boolean filter) throws Exception {
- // generate segment
+ // generate batch
GeneratorJob g = new GeneratorJob();
g.setConf(config);
String batchId = g.generate(numResults, System.currentTimeMillis(),
filter, false);
Modified:
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
---
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
(original)
+++
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
Tue Mar 26 14:09:27 2013
@@ -296,7 +296,7 @@ public class TestSpellCheckedMetadata ex
scmd.add("Accept-Ranges", "bytes");
scmd.add("ETag", "\"1234567-89-01234567\"");
scmd.add("Content-Length", "123");
- scmd.add(Nutch.SEGMENT_NAME_KEY, "segmentzzz");
+ scmd.add(Nutch.BATCH_NAME_KEY, "batchzzz");
scmd.add(Nutch.SIGNATURE_KEY, "123");
return scmd;
}
Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1461140&r1=1461139&r2=1461140&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Tue Mar
26 14:09:27 2013
@@ -36,7 +36,7 @@ public class TestURLUtil
URL url = null;
- url = new URL("http://lucene.apache.org/nutch");
+ url = new URL("http://nutch.apache.org");
assertEquals("apache.org", URLUtil.getDomainName(url));
url = new URL("http://en.wikipedia.org/wiki/Java_coffee");
@@ -133,35 +133,35 @@ public class TestURLUtil
}
- public void testGetHostSegments()
+ public void testGetHostBatches()
throws Exception {
URL url;
- String[] segments;
+ String[] batches;
url = new URL("http://subdomain.example.edu.tr");
- segments = URLUtil.getHostSegments(url);
- assertEquals("subdomain", segments[0]);
- assertEquals("example", segments[1]);
- assertEquals("edu", segments[2]);
- assertEquals("tr", segments[3]);
+ batches = URLUtil.getHostBatches(url);
+ assertEquals("subdomain", batches[0]);
+ assertEquals("example", batches[1]);
+ assertEquals("edu", batches[2]);
+ assertEquals("tr", batches[3]);
url = new URL("http://");
- segments = URLUtil.getHostSegments(url);
- assertEquals(1, segments.length);
- assertEquals("", segments[0]);
+ batches = URLUtil.getHostBatches(url);
+ assertEquals(1, batches.length);
+ assertEquals("", batches[0]);
url = new URL("http://140.211.11.130/foundation/contributing.html");
- segments = URLUtil.getHostSegments(url);
- assertEquals(1, segments.length);
- assertEquals("140.211.11.130", segments[0]);
+ batches = URLUtil.getHostBatches(url);
+ assertEquals(1, batches.length);
+ assertEquals("140.211.11.130", batches[0]);
// test non-ascii
url = new URL("http://www.example.忥.tw");
- segments = URLUtil.getHostSegments(url);
- assertEquals("www", segments[0]);
- assertEquals("example", segments[1]);
- assertEquals("忥", segments[2]);
- assertEquals("tw", segments[3]);
+ batches = URLUtil.getHostBatches(url);
+ assertEquals("www", batches[0]);
+ assertEquals("example", batches[1]);
+ assertEquals("忥", batches[2]);
+ assertEquals("tw", batches[3]);
}