Author: ferdy
Date: Tue Aug 14 07:30:29 2012
New Revision: 1372752
URL: http://svn.apache.org/viewvc?rev=1372752&view=rev
Log:
NUTCH-1365 Fix crawlId functionality by making use of new gora configuration
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1372752&r1=1372751&r2=1372752&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Aug 14 07:30:29 2012
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.1 - Current Development
+* NUTCH-1365 Fix crawlId functionalilty by making using of new gora configuration (ferdy)
+
* NUTCH-1442 indexingfilter.order is property is misread in code (ferdy via
lewismc)
* NUTCH-1450 Upgrade to gora deps to 0.2.1 (lewismc)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java?rev=1372752&r1=1372751&r2=1372752&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java Tue
Aug 14 07:30:29 2012
@@ -150,7 +150,7 @@ public class HostInjectorJob implements
job.setMapOutputValueClass(Host.class);
job.setOutputFormatClass(GoraOutputFormat.class);
GoraOutputFormat.setOutput(job,
- StorageUtils.createWebStore(getConf(), String.class, Host.class), true);
+ StorageUtils.createWebStore(job.getConfiguration(), String.class, Host.class), true);
job.setReducerClass(Reducer.class);
job.setNumReduceTasks(0);
return job.waitForCompletion(true);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java?rev=1372752&r1=1372751&r2=1372752&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java Tue
Aug 14 07:30:29 2012
@@ -33,6 +33,10 @@ import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.nutch.metadata.Nutch;
+/**
+ * Entry point to Gora store/mapreduce functionality.
+ * Translates the concept of "crawlid" to the corresponding Gora support.
+ */
public class StorageUtils {
/** Creates a store for the given persistentClass.
@@ -61,7 +65,9 @@ public class StorageUtils {
String crawlId = conf.get(Nutch.CRAWL_ID_KEY, "");
if (!crawlId.isEmpty()) {
- schema = crawlId + "_" + schema;
+ conf.set("schema.prefix", crawlId + "_");
+ } else {
+ conf.set("schema.prefix", "");
}
Class<? extends DataStore<K, V>> dataStoreClass =
@@ -71,7 +77,7 @@ public class StorageUtils {
}
@SuppressWarnings("unchecked")
- public static <K, V extends Persistent> Class<? extends DataStore<K, V>>
+ private static <K, V extends Persistent> Class<? extends DataStore<K, V>>
getDataStoreClass(Configuration conf) throws ClassNotFoundException {
return (Class<? extends DataStore<K, V>>)
Class.forName(conf.get("storage.data.store.class",
@@ -81,15 +87,6 @@ public class StorageUtils {
public static <K, V> void initMapperJob(Job job,
Collection<WebPage.Field> fields,
Class<K> outKeyClass, Class<V> outValueClass,
- Class<? extends GoraMapper<String, WebPage, K, V>> mapperClass, boolean reuseObjects)
- throws ClassNotFoundException, IOException {
- initMapperJob(job, fields, outKeyClass, outValueClass,
- mapperClass, null, reuseObjects);
- }
-
- public static <K, V> void initMapperJob(Job job,
- Collection<WebPage.Field> fields,
- Class<K> outKeyClass, Class<V> outValueClass,
Class<? extends GoraMapper<String, WebPage, K, V>> mapperClass)
throws ClassNotFoundException, IOException {
initMapperJob(job, fields, outKeyClass, outValueClass,
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java?rev=1372752&r1=1372751&r2=1372752&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java Tue Aug 14
07:30:29 2012
@@ -20,6 +20,7 @@ package org.apache.nutch.util;
import java.io.IOException;
import org.apache.avro.util.Utf8;
+import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.nutch.metadata.Nutch;
@@ -34,6 +35,12 @@ public class NutchJob extends Job {
public NutchJob(Configuration conf, String jobName) throws IOException {
super(conf, jobName);
+ //prefix jobName with crawlId if not empty
+ String crawlId = conf.get("storage.crawl.id");
+ if (!StringUtils.isEmpty(crawlId)) {
+ jobName = "["+crawlId+"]"+jobName;
+ setJobName(jobName);
+ }
setJarByClass(this.getClass());
}