Author: markus
Date: Tue Jun 12 10:18:09 2012
New Revision: 1349230
URL: http://svn.apache.org/viewvc?rev=1349230&view=rev
Log:
NUTCH-1356 ParseUtil use ExecutorService instead of manual thread handling
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1349230&r1=1349229&r2=1349230&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun 12 10:18:09 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1356 ParseUtil use ExecutorService instead of manually thread handling
(ferdy via markus)
+
* NUTCH-1352 Improve regex urlfilters/normalizers synchronization (ferdy via
markus)
* NUTCH-1024 Dynamically set fetchInterval by MIME-type (markus)
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1349230&r1=1349229&r2=1349230&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Tue Jun 12 10:18:09 2012
@@ -62,6 +62,8 @@
<dependency org="xerces" name="xercesImpl" rev="2.9.1" />
<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
<dependency org="oro" name="oro" rev="2.0.8" />
+
+ <dependency org="com.google.guava" name="guava" rev="11.0.2" />
<!--Configuration: test -->
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1349230&r1=1349229&r2=1349230&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Tue Jun 12
10:18:09 2012
@@ -18,15 +18,18 @@ package org.apache.nutch.parse;
// Commons Logging imports
-import java.util.concurrent.FutureTask;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.protocol.Content;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
/**
* A Utility class containing methods to simply perform parsing utilities such
@@ -43,7 +46,8 @@ public class ParseUtil {
public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
private ParserFactory parserFactory;
/** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
- private int MAX_PARSE_TIME = 30;
+ private int maxParseTime = 30;
+ private ExecutorService executorService;
/**
*
@@ -51,7 +55,9 @@ public class ParseUtil {
*/
public ParseUtil(Configuration conf) {
this.parserFactory = new ParserFactory(conf);
- MAX_PARSE_TIME=conf.getInt("parser.timeout", 30);
+ maxParseTime=conf.getInt("parser.timeout", 30);
+ executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
+ .setNameFormat("parse-%d").setDaemon(true).build());
}
/**
@@ -83,7 +89,7 @@ public class ParseUtil {
if (LOG.isDebugEnabled()) {
LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] +
"]");
}
- if (MAX_PARSE_TIME!=-1)
+ if (maxParseTime!=-1)
parseResult = runParser(parsers[i], content);
else
parseResult = parsers[i].getParse(content);
@@ -133,7 +139,7 @@ public class ParseUtil {
}
ParseResult parseResult = null;
- if (MAX_PARSE_TIME!=-1)
+ if (maxParseTime!=-1)
parseResult = runParser(p, content);
else
parseResult = p.getParse(content);
@@ -150,20 +156,14 @@ public class ParseUtil {
private ParseResult runParser(Parser p, Content content) {
ParseCallable pc = new ParseCallable(p, content);
- FutureTask<ParseResult> task = new FutureTask<ParseResult>(pc);
+ Future<ParseResult> task = executorService.submit(pc);
ParseResult res = null;
- Thread t = new Thread(task);
- t.start();
try {
- res = task.get(MAX_PARSE_TIME, TimeUnit.SECONDS);
- } catch (TimeoutException e) {
- LOG.warn("TIMEOUT parsing " + content.getUrl() + " with " + p);
+ res = task.get(maxParseTime, TimeUnit.SECONDS);
} catch (Exception e) {
+ LOG.warn("Error parsing " + content.getUrl() + " with " + p, e);
task.cancel(true);
- res = null;
- t.interrupt();
} finally {
- t = null;
pc = null;
}
return res;