Author: snagel
Date: Thu Jan 7 20:57:13 2016
New Revision: 1723626

URL: http://svn.apache.org/viewvc?rev=1723626&view=rev
Log:
NUTCH-2143 GeneratorJob ignores batch id passed as argument

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1723626&r1=1723625&r2=1723626&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jan 7 20:57:13 2016
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1
 
+* NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, lewismc, snagel)
+
 * NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
 
 * NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1723626&r1=1723625&r2=1723626&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Thu Jan 7 20:57:13 2016
@@ -163,17 +163,20 @@ public class GeneratorJob extends NutchT
     return fields;
   }
 
+  /** Generate a random batch id */
+  public static String randomBatchId() {
+    long curTime = System.currentTimeMillis();
+    int randomSeed = Math.abs(new Random().nextInt());
+    String batchId = (curTime / 1000) + "-" + randomSeed;
+    return batchId;
+  }
+
   public Map<String, Object> run(Map<String, Object> args) throws Exception {
     String batchId = (String) args.get(Nutch.ARG_BATCH);
-    if (batchId != null) {
-      getConf().set(GeneratorJob.BATCH_ID, batchId);
-    } else {
-      // generate batchId
-      long curTime = System.currentTimeMillis();
-      int randomSeed = Math.abs(new Random().nextInt());
-      batchId = (curTime / 1000) + "-" + randomSeed;
-      getConf().set(BATCH_ID, batchId);
+    if (batchId == null) {
+      batchId = randomBatchId();
     }
+    getConf().set(BATCH_ID, batchId);
 
     // map to inverted subset due for fetch, sort by score
     Long topN = null;
@@ -249,10 +252,15 @@ public class GeneratorJob extends NutchT
     if (topN != Long.MAX_VALUE) {
       LOG.info("GeneratorJob: topN: " + topN);
     }
+    String batchId = getConf().get(BATCH_ID);
     Map<String, Object> results = run(ToolUtil.toArgMap(Nutch.ARG_TOPN, topN,
         Nutch.ARG_CURTIME, curTime, Nutch.ARG_FILTER, filter,
-        Nutch.ARG_NORMALIZE, norm));
-    String batchId = getConf().get(BATCH_ID);
+        Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId));
+    if (batchId == null) {
+      // use generated random batch id
+      batchId = (String) results.get(BATCH_ID);
+    }
+
     long finish = System.currentTimeMillis();
     long generateCount = (Long) results.get(GENERATE_COUNT);
     LOG.info("GeneratorJob: finished at " + sdf.format(finish)
@@ -290,11 +298,6 @@ public class GeneratorJob extends NutchT
     long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
     boolean filter = true, norm = true;
 
-    // generate batchId
-    int randomSeed = Math.abs(new Random().nextInt());
-    String batchId = (curTime / 1000) + "-" + randomSeed;
-    getConf().set(BATCH_ID, batchId);
-
     for (int i = 0; i < args.length; i++) {
       if ("-topN".equals(args[i])) {
         topN = Long.parseLong(args[++i]);
@@ -307,9 +310,9 @@ public class GeneratorJob extends NutchT
       } else if ("-adddays".equals(args[i])) {
         long numDays = Integer.parseInt(args[++i]);
         curTime += numDays * 1000L * 60 * 60 * 24;
-      } else if ("-batchId".equals(args[i]))
+      } else if ("-batchId".equals(args[i])) {
         getConf().set(BATCH_ID, args[++i]);
-      else {
+      } else {
         System.err.println("Unrecognized arg " + args[i]);
         return -1;
       }