[
https://issues.apache.org/jira/browse/NUTCH-1228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16453805#comment-16453805
]
ASF GitHub Bot commented on NUTCH-1228:
---------------------------------------
sebastian-nagel closed pull request #319: NUTCH-1228 Change mapred.task.timeout
to mapreduce.task.timeout in fetcher
URL: https://github.com/apache/nutch/pull/319
This is a PR merged from a forked repository.
Because GitHub hides the original diff once a foreign (forked) pull
request is merged, the diff is reproduced below for the sake of
provenance:
diff --git a/src/bin/crawl b/src/bin/crawl
index 1a31d7d0a..27db6de6c 100644
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -61,7 +61,7 @@ fi
numSlaves=1
# and the total number of available tasks
-# sets Hadoop parameter "mapred.reduce.tasks"
+# sets Hadoop parameter "mapreduce.job.reduces"
numTasks=`expr $numSlaves \* 2`
# number of urls to fetch in one iteration
@@ -88,7 +88,7 @@ fi
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
-commonOptions="-D mapred.reduce.tasks=$numTasks -D
mapred.child.java.opts=-Xmx1000m -D
mapred.reduce.tasks.speculative.execution=false -D
mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"
+commonOptions="-D mapreduce.job.reduces=$numTasks -D
mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D
mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
@@ -161,7 +161,7 @@ do
echo "Parsing : "
# enable the skipping of records for the parsing so that a dodgy document
# so that it does not fail the full task
- skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D
mapred.skip.map.max.skip.records=1"
+ skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D
mapreduce.map.skip.maxrecords=1"
__bin_nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId
"$CRAWL_ID"
# updatedb with this batch
diff --git a/src/java/org/apache/nutch/crawl/WebTableReader.java
b/src/java/org/apache/nutch/crawl/WebTableReader.java
index 5985dd6cf..941ae9ac4 100644
--- a/src/java/org/apache/nutch/crawl/WebTableReader.java
+++ b/src/java/org/apache/nutch/crawl/WebTableReader.java
@@ -539,7 +539,7 @@ public int run(String[] args) throws Exception {
// for now handles only -stat
@Override
public Map<String, Object> run(Map<String, Object> args) throws Exception {
- Path tmpFolder = new Path(getConf().get("mapred.temp.dir", ".")
+ Path tmpFolder = new Path(getConf().get("mapreduce.cluster.temp.dir", ".")
+ "stat_tmp" + System.currentTimeMillis());
numJobs = 1;
diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java
b/src/java/org/apache/nutch/fetcher/FetcherJob.java
index bd06121b2..82e7a126c 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherJob.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java
@@ -214,7 +214,7 @@ public FetcherJob(Configuration conf) {
StorageUtils.initReducerJob(currentJob, FetcherReducer.class);
if (numTasks == null || numTasks < 1) {
currentJob.setNumReduceTasks(currentJob.getConfiguration().getInt(
- "mapred.map.tasks", currentJob.getNumReduceTasks()));
+ "mapreduce.job.maps", currentJob.getNumReduceTasks()));
} else {
currentJob.setNumReduceTasks(numTasks);
}
@@ -247,7 +247,7 @@ public FetcherJob(Configuration conf) {
* @param shouldResume
* @param numTasks
* number of fetching tasks (reducers). If set to < 1 then use
the
- * default, which is mapred.map.tasks.
+ * default, which is mapreduce.job.maps.
* @return 0 on success
* @throws Exception
*/
@@ -267,7 +267,7 @@ public int fetch(String batchId, int threads, boolean
shouldResume,
* @param shouldResume
* @param numTasks
* number of fetching tasks (reducers). If set to < 1 then use
the
- * default, which is mapred.map.tasks.
+ * default, which is mapreduce.job.maps.
* @param stmDetect
* If set true, sitemap detection is run.
* @param sitemap
@@ -326,7 +326,7 @@ public int run(String[] args) throws Exception {
+ " -crawlId <id> - the id to prefix the schemas to operate on, \n
\t \t (default: storage.crawl.id)\n"
+ " -threads N - number of fetching threads per task\n"
+ " -resume - resume interrupted job\n"
- + " -numTasks N - if N > 0 then use this many reduce tasks for
fetching \n \t \t (default: mapred.map.tasks)"
+ + " -numTasks N - if N > 0 then use this many reduce tasks for
fetching \n \t \t (default: mapreduce.job.maps)"
+ " -sitemap - only sitemap files are fetched, defaults to
false"
+ " -stmDetect - sitemap files are detected from robot.txt file";
diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
index 364bf7e38..4f7195458 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
@@ -842,7 +842,7 @@ public void run(Context context) throws IOException,
InterruptedException {
ft.start();
}
// select a timeout that avoids a task timeout
- final long timeout = conf.getInt("mapred.task.timeout", 10 * 60 * 1000) /
2;
+ final long timeout = conf.getInt("mapreduce.task.timeout", 10 * 60 * 1000)
/ 2;
// Used for threshold check, holds pages and bytes processed in the last
sec
float pagesLastSec;
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java
b/src/java/org/apache/nutch/indexer/CleaningJob.java
index eaa842060..4869efab9 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -138,7 +138,7 @@ public void cleanup(Context context) throws IOException {
getConf().setBoolean(ARG_COMMIT, (Boolean) args.get(ARG_COMMIT));
currentJob = NutchJob.getInstance(getConf(), "CleaningJob");
currentJob.getConfiguration().setClass(
- "mapred.output.key.comparator.class", StringComparator.class,
+ "mapreduce.job.output.key.comparator.class", StringComparator.class,
RawComparator.class);
Collection<WebPage.Field> fields = getFields(currentJob);
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java
b/src/java/org/apache/nutch/indexer/IndexingJob.java
index f98d40dcb..ec0438431 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -145,7 +145,7 @@ public void map(String key, WebPage page, Context context)
Job job = NutchJob.getInstance(conf, "Indexer");
// TODO: Figure out why this needs to be here
- job.getConfiguration().setClass("mapred.output.key.comparator.class",
+
job.getConfiguration().setClass("mapreduce.job.output.key.comparator.class",
StringComparator.class, RawComparator.class);
Collection<WebPage.Field> fields = getFields(job);
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> Change mapred.task.timeout to mapreduce.task.timeout in fetcher
> ---------------------------------------------------------------
>
> Key: NUTCH-1228
> URL: https://issues.apache.org/jira/browse/NUTCH-1228
> Project: Nutch
> Issue Type: Task
> Components: fetcher
> Reporter: Markus Jelsma
> Assignee: Markus Jelsma
> Priority: Trivial
> Fix For: 2.4
>
> Attachments: NUTCH-1228-2.1.patch
>
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)