This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 8431dcfe5 NUTCH-3013 Employ commons-lang3's StopWatch to simplify
timing logic (#788)
8431dcfe5 is described below
commit 8431dcfe52f5395a0fd9e3c00db009dbb2bcf6f5
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Sat Oct 21 11:09:31 2023 -0700
NUTCH-3013 Employ commons-lang3's StopWatch to simplify timing logic (#788)
---
.github/workflows/master-build.yml | 1 -
.gitignore | 1 +
src/java/org/apache/nutch/crawl/CrawlDb.java | 19 +++++++++--------
src/java/org/apache/nutch/crawl/CrawlDbMerger.java | 16 +++++++--------
.../org/apache/nutch/crawl/DeduplicationJob.java | 16 +++++++--------
src/java/org/apache/nutch/crawl/Generator.java | 17 +++++++--------
src/java/org/apache/nutch/crawl/Injector.java | 16 +++++++--------
src/java/org/apache/nutch/crawl/LinkDb.java | 15 +++++++-------
src/java/org/apache/nutch/crawl/LinkDbMerger.java | 16 +++++++--------
src/java/org/apache/nutch/crawl/LinkDbReader.java | 24 ++++++++++------------
src/java/org/apache/nutch/fetcher/Fetcher.java | 17 +++++++--------
src/java/org/apache/nutch/hostdb/ReadHostDb.java | 15 +++++++-------
src/java/org/apache/nutch/hostdb/UpdateHostDb.java | 16 +++++++--------
src/java/org/apache/nutch/indexer/CleaningJob.java | 16 +++++++--------
src/java/org/apache/nutch/indexer/IndexingJob.java | 16 +++++++--------
src/java/org/apache/nutch/parse/ParseSegment.java | 21 ++++++++-----------
.../apache/nutch/scoring/webgraph/LinkDumper.java | 17 +++++++--------
.../apache/nutch/scoring/webgraph/LinkRank.java | 16 +++++++--------
.../apache/nutch/scoring/webgraph/NodeDumper.java | 16 +++++++--------
.../nutch/scoring/webgraph/ScoreUpdater.java | 16 +++++++--------
.../apache/nutch/scoring/webgraph/WebGraph.java | 24 ++++++++++------------
src/java/org/apache/nutch/tools/FreeGenerator.java | 16 +++++++--------
.../apache/nutch/tools/arc/ArcSegmentCreator.java | 16 +++++++--------
.../org/apache/nutch/tools/warc/WARCExporter.java | 15 +++++++-------
.../apache/nutch/util/CrawlCompletionStats.java | 15 +++++++-------
.../nutch/util/ProtocolStatusStatistics.java | 19 ++++++++---------
.../org/apache/nutch/util/SitemapProcessor.java | 12 +++++++----
.../apache/nutch/util/domain/DomainStatistics.java | 16 +++++++--------
.../urlfilter/api/RegexURLFilterBaseTest.java | 11 +++++-----
.../regex/TestRegexURLNormalizer.java | 8 ++++++--
30 files changed, 234 insertions(+), 225 deletions(-)
diff --git a/.github/workflows/master-build.yml
b/.github/workflows/master-build.yml
index e3ed11c86..ba1d470ec 100644
--- a/.github/workflows/master-build.yml
+++ b/.github/workflows/master-build.yml
@@ -22,7 +22,6 @@ on:
branches: [ master ]
pull_request:
branches: [ master ]
-
jobs:
build:
diff --git a/.gitignore b/.gitignore
index 0612a99c2..b46690852 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ naivebayes-model
csvindexwriter
lib/spotbugs-*
ivy/dependency-check-ant/*
+.gradle*
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java
b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 3819bb3a0..16394832b 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -19,14 +19,15 @@ package org.apache.nutch.crawl;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -49,7 +50,6 @@ import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.TimingUtil;
/**
* This class takes the output of the fetcher and updates the crawldb
@@ -85,10 +85,11 @@ public class CrawlDb extends NutchTool implements Tool {
public void update(Path crawlDb, Path[] segments, boolean normalize,
boolean filter, boolean additionsAllowed, boolean force)
throws IOException, InterruptedException, ClassNotFoundException {
- Path lock = lock(getConf(), crawlDb, force);
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+
+ Path lock = lock(getConf(), crawlDb, force);
Job job = CrawlDb.createJob(getConf(), crawlDb);
Configuration conf = job.getConfiguration();
@@ -98,7 +99,7 @@ public class CrawlDb extends NutchTool implements Tool {
boolean url404Purging = conf.getBoolean(CRAWLDB_PURGE_404, false);
- LOG.info("CrawlDb update: starting at {}", sdf.format(start));
+ LOG.info("CrawlDb update: starting");
LOG.info("CrawlDb update: db: {}", crawlDb);
LOG.info("CrawlDb update: segments: {}", Arrays.asList(segments));
LOG.info("CrawlDb update: additions allowed: {}", additionsAllowed);
@@ -151,9 +152,9 @@ public class CrawlDb extends NutchTool implements Tool {
urlsFiltered);
}
- long end = System.currentTimeMillis();
- LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("CrawlDb update: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
/*
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
index 70c65135e..1bf7243d3 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -18,11 +18,12 @@ package org.apache.nutch.crawl;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Map.Entry;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -44,7 +45,6 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
/**
* This tool merges several CrawlDb-s into one, optionally filtering URLs
@@ -129,9 +129,9 @@ public class CrawlDbMerger extends Configured implements
Tool {
throws Exception {
Path lock = CrawlDb.lock(getConf(), output, false);
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("CrawlDb merge: starting at {}", sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("CrawlDb merge: starting");
Job job = createMergeJob(getConf(), output, normalize, filter);
for (int i = 0; i < dbs.length; i++) {
@@ -155,9 +155,9 @@ public class CrawlDbMerger extends Configured implements
Tool {
NutchJob.cleanupAfterFailure(outPath, lock, fs);
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("CrawlDb merge: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static Job createMergeJob(Configuration conf, Path output,
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index ae5ac37ce..217005d41 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -21,11 +21,12 @@ import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
-import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -48,7 +49,6 @@ import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -298,9 +298,9 @@ public class DeduplicationJob extends NutchTool implements
Tool {
}
}
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("DeduplicationJob: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("DeduplicationJob: starting");
Path tempDir = new Path(crawlDb, "dedup-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -381,9 +381,9 @@ public class DeduplicationJob extends NutchTool implements
Tool {
// clean up
fs.delete(tempDir, true);
- long end = System.currentTimeMillis();
- LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("Deduplication finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
return 0;
}
diff --git a/src/java/org/apache/nutch/crawl/Generator.java
b/src/java/org/apache/nutch/crawl/Generator.java
index d1569e1f0..1b62314e7 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -30,7 +30,9 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configurable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -76,7 +78,6 @@ import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.SegmentReaderUtil;
-import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
@@ -821,10 +822,10 @@ public class Generator extends NutchTool implements Tool {
Path lock = CrawlDb.lock(getConf(), dbDir, force);
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("Generator: starting at " + sdf.format(start));
- LOG.info("Generator: Selecting best-scoring urls due for fetch.");
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("Generator: starting");
+ LOG.info("Generator: selecting best-scoring urls due for fetch.");
LOG.info("Generator: filtering: " + filter);
LOG.info("Generator: normalizing: " + norm);
if (topN != Long.MAX_VALUE) {
@@ -982,9 +983,9 @@ public class Generator extends NutchTool implements Tool {
}
fs.delete(tempDir, true);
- long end = System.currentTimeMillis();
- LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("Generator: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
Path[] patharray = new Path[generatedSegments.size()];
return generatedSegments.toArray(patharray);
diff --git a/src/java/org/apache/nutch/crawl/Injector.java
b/src/java/org/apache/nutch/crawl/Injector.java
index 9fca719f6..9bfd1b454 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.crawl;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -45,17 +46,16 @@ import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
/**
* Injector takes a flat text file of URLs (or a folder containing text files)
@@ -372,10 +372,11 @@ public class Injector extends NutchTool implements Tool {
boolean update, boolean normalize, boolean filter,
boolean filterNormalizeAll)
throws IOException, ClassNotFoundException, InterruptedException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("Injector: starting at {}", sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+
+ LOG.info("Injector: starting");
LOG.info("Injector: crawlDb: {}", crawlDb);
LOG.info("Injector: urlDir: {}", urlDir);
LOG.info("Injector: Converting injected urls to crawl db entries.");
@@ -479,9 +480,8 @@ public class Injector extends NutchTool implements Tool {
urlsPurged404);
}
- long end = System.currentTimeMillis();
- LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end),
- TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("Injector: finished, elapsed: {} ms",
stopWatch.getTime(TimeUnit.MILLISECONDS));
}
} catch (IOException | InterruptedException | ClassNotFoundException |
NullPointerException e) {
LOG.error("Injector job failed: {}", e.getMessage());
diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java
b/src/java/org/apache/nutch/crawl/LinkDb.java
index 2b3d2ed90..3c752ab1d 100644
--- a/src/java/org/apache/nutch/crawl/LinkDb.java
+++ b/src/java/org/apache/nutch/crawl/LinkDb.java
@@ -21,13 +21,14 @@ import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -54,7 +55,6 @@ import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.TimingUtil;
/** Maintains an inverted link map, listing incoming links for each url. */
public class LinkDb extends NutchTool implements Tool {
@@ -196,9 +196,9 @@ public class LinkDb extends NutchTool implements Tool {
Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
Configuration conf = job.getConfiguration();
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("LinkDb: starting at {}", sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("LinkDb: starting");
LOG.info("LinkDb: linkdb: {}", linkDb);
LOG.info("LinkDb: URL normalize: {}", normalize);
LOG.info("LinkDb: URL filter: {}", filter);
@@ -260,8 +260,9 @@ public class LinkDb extends NutchTool implements Tool {
}
LinkDb.install(job, linkDb);
- long end = System.currentTimeMillis();
- LOG.info("LinkDb: finished at {}, elapsed: {}", sdf.format(end),
TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("LinkDb: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
private static Job createJob(Configuration config, Path linkDb,
diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java
b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
index f696c599e..d6a41ab48 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
@@ -18,11 +18,12 @@ package org.apache.nutch.crawl;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -41,7 +42,6 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
/**
* This tool merges several LinkDb-s into one, optionally filtering URLs
through
@@ -112,9 +112,9 @@ public class LinkDbMerger extends Configured implements
Tool {
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter)
throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("LinkDb merge: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("LinkDb merge: starting");
Job job = createMergeJob(getConf(), output, normalize, filter);
for (int i = 0; i < dbs.length; i++) {
@@ -137,9 +137,9 @@ public class LinkDbMerger extends Configured implements
Tool {
fs.rename(FileOutputFormat.getOutputPath(job), new Path(output,
LinkDb.CURRENT_NAME));
- long end = System.currentTimeMillis();
- LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("LinkDb merge: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static Job createMergeJob(Configuration config, Path linkDb,
diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java
b/src/java/org/apache/nutch/crawl/LinkDbReader.java
index c307b985d..fa01f20bf 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbReader.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -16,13 +16,15 @@
*/
package org.apache.nutch.crawl;
+import java.io.Closeable;
import java.io.IOException;
-
import java.lang.invoke.MethodHandles;
+import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import java.util.Iterator;
-// Commons Logging imports
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -46,11 +48,8 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.AbstractChecker;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
-import java.text.SimpleDateFormat;
-import java.util.Iterator;
-import java.io.Closeable;
+
/**
* Read utility for the LinkDb.
@@ -153,10 +152,9 @@ public class LinkDbReader extends AbstractChecker
implements Closeable {
public void processDumpJob(String linkdb, String output, String regex)
throws IOException, InterruptedException, ClassNotFoundException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
-
- LOG.info("LinkDb dump: starting at {}", sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("LinkDb dump: starting");
LOG.info("LinkDb dump: db: {}", linkdb);
Path outFolder = new Path(output);
@@ -192,9 +190,9 @@ public class LinkDbReader extends AbstractChecker
implements Closeable {
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("LinkDb dump: finished at {}, elapsed: {}",
- sdf.format(end), TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("LinkDb dump: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
@Override
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java
b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 3727dcebe..92aef6f10 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -25,9 +25,11 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
@@ -454,11 +456,10 @@ public class Fetcher extends NutchTool implements Tool {
checkConfiguration();
- long start = System.currentTimeMillis();
- if (LOG.isInfoEnabled()) {
- LOG.info("Fetcher: starting at {}", TimingUtil.logDateMillis(start));
- LOG.info("Fetcher: segment: {}", segment);
- }
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("Fetcher: starting");
+ LOG.info("Fetcher: segment: {}", segment);
// set the actual time for the timelimit relative
// to the beginning of the whole job and not of a specific task
@@ -530,9 +531,9 @@ public class Fetcher extends NutchTool implements Tool {
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("Fetcher: finished at {}, elapsed: {}",
- TimingUtil.logDateMillis(end), TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
/**
diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
index ffddb1889..0321a8652 100644
--- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -18,9 +18,10 @@ package org.apache.nutch.hostdb;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -42,7 +43,6 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.SegmentReaderUtil;
import org.apache.commons.jexl3.JexlBuilder;
@@ -168,9 +168,9 @@ public class ReadHostDb extends Configured implements Tool {
// }
private void readHostDb(Path hostDb, Path output, boolean dumpHomepages,
boolean dumpHostnames, String expr) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("ReadHostDb: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("ReadHostDb: starting");
Configuration conf = getConf();
conf.setBoolean(HOSTDB_DUMP_HOMEPAGES, dumpHomepages);
@@ -211,8 +211,9 @@ public class ReadHostDb extends Configured implements Tool {
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " +
TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("ReadHostDb: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
private void getHostDbRecord(Path hostDb, String host) throws Exception {
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
index ffa68d096..65e45c55d 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
@@ -17,9 +17,10 @@
package org.apache.nutch.hostdb;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
@@ -40,7 +41,6 @@ import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -73,9 +73,9 @@ public class UpdateHostDb extends Configured implements Tool {
boolean checkFailed, boolean checkNew, boolean checkKnown,
boolean force, boolean filter, boolean normalize) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("UpdateHostDb: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("UpdateHostDb: starting");
Job job = NutchJob.getInstance(getConf());
Configuration conf = job.getConfiguration();
@@ -149,9 +149,9 @@ public class UpdateHostDb extends Configured implements
Tool {
}
LockUtil.removeLockFile(fs, lock);
- long end = System.currentTimeMillis();
- LOG.info("UpdateHostDb: finished at " + sdf.format(end) +
- ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("UpdateHostDb: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static void main(String args[]) throws Exception {
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java
b/src/java/org/apache/nutch/indexer/CleaningJob.java
index dc3ed69e4..04b9c2efa 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -18,7 +18,9 @@ package org.apache.nutch.indexer;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
@@ -36,7 +38,6 @@ import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -139,9 +140,9 @@ public class CleaningJob implements Tool {
public void delete(String crawldb, boolean noCommit)
throws IOException, InterruptedException, ClassNotFoundException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("CleaningJob: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("CleaningJob: starting");
Job job = NutchJob.getInstance(getConf());
Configuration conf = job.getConfiguration();
@@ -173,9 +174,8 @@ public class CleaningJob implements Tool {
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("CleaningJob: finished, elapsed: {} ms",
stopWatch.getTime(TimeUnit.MILLISECONDS));
}
@Override
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java
b/src/java/org/apache/nutch/indexer/IndexingJob.java
index ff46bc0ef..d2115230c 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -19,7 +19,6 @@ package org.apache.nutch.indexer;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -27,7 +26,9 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.segment.SegmentChecker;
import org.apache.hadoop.conf.Configuration;
@@ -44,7 +45,6 @@ import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -104,9 +104,9 @@ public class IndexingJob extends NutchTool implements Tool {
boolean filter, boolean normalize, boolean addBinaryContent,
boolean base64) throws IOException, InterruptedException,
ClassNotFoundException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("Indexer: starting at {}", sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("Indexer: starting");
final Job job = NutchJob.getInstance(getConf());
job.setJobName("Indexer");
@@ -159,9 +159,9 @@ public class IndexingJob extends NutchTool implements Tool {
String.format(Locale.ROOT, "%6d", counter.getValue()),
counter.getName());
}
- long end = System.currentTimeMillis();
- LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("Indexer: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
} finally {
tmp.getFileSystem(conf).delete(tmp, true);
}
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java
b/src/java/org/apache/nutch/parse/ParseSegment.java
index c4e271fee..de45c463b 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.parse;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.nutch.crawl.CrawlDatum;
@@ -25,7 +26,6 @@ import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.TimingUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -50,13 +50,12 @@ import org.apache.hadoop.io.WritableComparable;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.concurrent.TimeUnit;
/* Parse content in a segment. */
public class ParseSegment extends NutchTool implements Tool {
@@ -228,12 +227,10 @@ public class ParseSegment extends NutchTool implements
Tool {
return;
}
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- if (LOG.isInfoEnabled()) {
- LOG.info("ParseSegment: starting at {}", sdf.format(start));
- LOG.info("ParseSegment: segment: {}", segment);
- }
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("ParseSegment: starting");
+ LOG.info("ParseSegment: segment: {}", segment);
Job job = NutchJob.getInstance(getConf());
job.setJobName("parse " + segment);
@@ -263,9 +260,9 @@ public class ParseSegment extends NutchTool implements Tool
{
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("ParseSegment: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static void main(String[] args) throws Exception {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
index 54cd8b8ed..4831d73f3 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
@@ -20,10 +20,11 @@ import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
@@ -31,6 +32,7 @@ import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -57,7 +59,6 @@ import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
/**
* The LinkDumper tool creates a database of node to inlink information that
can
@@ -327,9 +328,9 @@ public class LinkDumper extends Configured implements Tool {
public void dumpLinks(Path webGraphDb) throws IOException,
InterruptedException, ClassNotFoundException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("NodeDumper: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("NodeDumper: starting");
Configuration conf = getConf();
FileSystem fs = webGraphDb.getFileSystem(conf);
@@ -400,9 +401,9 @@ public class LinkDumper extends Configured implements Tool {
}
fs.delete(tempInverted, true);
- long end = System.currentTimeMillis();
- LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("LinkDumper: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static void main(String[] args) throws Exception {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
index 739fe6cec..c226ad130 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
@@ -21,12 +21,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
+import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -35,6 +35,7 @@ import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -65,7 +66,6 @@ import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
public class LinkRank extends Configured implements Tool {
@@ -651,9 +651,9 @@ public class LinkRank extends Configured implements Tool {
public void analyze(Path webGraphDb) throws IOException,
ClassNotFoundException, InterruptedException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("Analysis: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("LinkRank Analysis: starting");
// store the link rank under the webgraphdb temporarily, final scores get
// upddated into the nodedb
@@ -714,9 +714,9 @@ public class LinkRank extends Configured implements Tool {
// remove the temporary link rank folder
fs.delete(linkRank, true);
- long end = System.currentTimeMillis();
- LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("LinkRank Analysis: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static void main(String[] args) throws Exception {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
index ede9fa1c5..dfccccc19 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
@@ -18,7 +18,7 @@ package org.apache.nutch.scoring.webgraph;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
+import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -27,6 +27,7 @@ import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -48,7 +49,6 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
@@ -293,9 +293,9 @@ public class NodeDumper extends Configured implements Tool {
boolean asEff, NameType nameType, AggrType aggrType,
boolean asSequenceFile) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("NodeDumper: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("NodeDumper: starting");
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
Job dumper = NutchJob.getInstance(getConf());
@@ -357,9 +357,9 @@ public class NodeDumper extends Configured implements Tool {
LOG.error("NodeDumper job failed:", e);
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("NodeDumper: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static void main(String[] args) throws Exception {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
index 130e1b2a1..c10a6e37b 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
@@ -18,8 +18,8 @@ package org.apache.nutch.scoring.webgraph;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -28,6 +28,7 @@ import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -51,7 +52,6 @@ import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
/**
* Updates the score from the WebGraph node database into the crawl database.
@@ -156,9 +156,9 @@ public class ScoreUpdater extends Configured implements
Tool{
public void update(Path crawlDb, Path webGraphDb) throws IOException,
ClassNotFoundException, InterruptedException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("ScoreUpdater: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("ScoreUpdater: starting");
Configuration conf = getConf();
@@ -213,9 +213,9 @@ public class ScoreUpdater extends Configured implements
Tool{
LOG.info("ScoreUpdater: installing new crawldb " + crawlDb);
CrawlDb.install(updater, crawlDb);
- long end = System.currentTimeMillis();
- LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("ScoreUpdater: finished, elapsed: {} ms ", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static void main(String[] args) throws Exception {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
index 63d0ead7d..b98329d1e 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
@@ -18,7 +18,6 @@ package org.apache.nutch.scoring.webgraph;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
@@ -26,6 +25,7 @@ import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
+import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -34,6 +34,7 @@ import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -67,7 +68,6 @@ import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
@@ -518,14 +518,12 @@ public class WebGraph extends Configured implements Tool {
boolean normalize, boolean filter) throws IOException,
InterruptedException, ClassNotFoundException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- if (LOG.isInfoEnabled()) {
- LOG.info("WebGraphDb: starting at " + sdf.format(start));
- LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
- LOG.info("WebGraphDb: URL normalize: " + normalize);
- LOG.info("WebGraphDb: URL filter: " + filter);
- }
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("WebGraphDb: starting");
+ LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
+ LOG.info("WebGraphDb: URL normalize: " + normalize);
+ LOG.info("WebGraphDb: URL filter: " + filter);
FileSystem fs = webGraphDb.getFileSystem(getConf());
@@ -715,9 +713,9 @@ public class WebGraph extends Configured implements Tool {
// remove the lock file for the webgraph
LockUtil.removeLockFile(fs, lock);
- long end = System.currentTimeMillis();
- LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("WebGraphDb: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static void main(String[] args) throws Exception {
diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java
b/src/java/org/apache/nutch/tools/FreeGenerator.java
index 039bccaec..e9f5c8761 100644
--- a/src/java/org/apache/nutch/tools/FreeGenerator.java
+++ b/src/java/org/apache/nutch/tools/FreeGenerator.java
@@ -18,10 +18,11 @@ package org.apache.nutch.tools;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map.Entry;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.Path;
@@ -47,7 +48,6 @@ import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
/**
* This tool generates fetchlists (segments to be fetched) from plain text
files
@@ -180,9 +180,9 @@ public class FreeGenerator extends Configured implements
Tool {
}
}
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("FreeGenerator: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("FreeGenerator: starting");
Job job = NutchJob.getInstance(getConf());
Configuration conf = job.getConfiguration();
@@ -226,9 +226,9 @@ public class FreeGenerator extends Configured implements
Tool {
LOG.error("FAILED: " + StringUtils.stringifyException(e));
return -1;
}
- long end = System.currentTimeMillis();
- LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("FreeGenerator: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
return 0;
}
diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
index 4e916dbd5..825e752cc 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
@@ -21,7 +21,9 @@ import java.lang.invoke.MethodHandles;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map.Entry;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -56,7 +58,6 @@ import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.TimingUtil;
/**
* <p>
@@ -368,10 +369,10 @@ public class ArcSegmentCreator extends Configured
implements Tool {
public void createSegments(Path arcFiles, Path segmentsOutDir)
throws IOException, InterruptedException, ClassNotFoundException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
if (LOG.isInfoEnabled()) {
- LOG.info("ArcSegmentCreator: starting at " + sdf.format(start));
+ LOG.info("ArcSegmentCreator: starting");
LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
}
@@ -402,10 +403,9 @@ public class ArcSegmentCreator extends Configured
implements Tool {
throw e;
}
-
- long end = System.currentTimeMillis();
- LOG.info("ArcSegmentCreator: finished at " + sdf.format(end)
- + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("ArcSegmentCreator: finished, elapsed: {} ms" + stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
public static void main(String args[]) throws Exception {
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index cf000ba52..6d8a38557 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -29,8 +29,10 @@ import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.UUID;
+import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
@@ -58,7 +60,6 @@ import org.apache.nutch.tools.WARCUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -428,9 +429,9 @@ public class WARCExporter extends Configured implements
Tool {
public int generateWARC(String output, List<Path> segments,
boolean onlySuccessfulResponses, boolean includeParseData,
boolean includeParseText) throws IOException {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("WARCExporter: starting at {}", sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("WARCExporter: starting");
final Job job = NutchJob.getInstance(getConf());
job.setJobName("warc-exporter " + output);
@@ -479,9 +480,9 @@ public class WARCExporter extends Configured implements
Tool {
throw new RuntimeException(message);
}
LOG.info(job.getCounters().toString());
- long end = System.currentTimeMillis();
- LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end),
- TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("WARCExporter: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("WARCExporter job failed: {}", e.getMessage());
return -1;
diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index 7210ee83a..8696d2822 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -20,7 +20,7 @@ import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
-import java.text.SimpleDateFormat;
+import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -30,6 +30,7 @@ import org.apache.commons.cli.MissingOptionException;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
@@ -127,9 +128,9 @@ public class CrawlCompletionStats extends Configured
implements Tool {
numOfReducers = Integer.parseInt(args[3]);
}
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("CrawlCompletionStats: starting");
int mode = 0;
String jobName = "CrawlCompletionStats";
@@ -180,9 +181,9 @@ public class CrawlCompletionStats extends Configured
implements Tool {
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}",
- sdf.format(end), TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("CrawlCompletionStats: finished, elapsed: {} ms",
stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
return 0;
}
diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
index 2499da0bf..0fe6c57d0 100644
--- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
+++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -16,10 +16,11 @@
*/
package org.apache.nutch.util;
-import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -37,8 +38,6 @@ import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.metadata.Nutch;
/**
@@ -86,9 +85,9 @@ public class ProtocolStatusStatistics extends Configured
implements Tool {
numOfReducers = Integer.parseInt(args[2]);
}
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("ProtocolStatistics: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("ProtocolStatistics: starting");
String jobName = "ProtocolStatistics";
@@ -130,9 +129,9 @@ public class ProtocolStatusStatistics extends Configured
implements Tool {
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ",
elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("ProtocolStatistics: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
return 0;
}
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java
b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 98f7df839..66fa9b0e7 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -22,7 +22,9 @@ import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.List;
import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
@@ -359,8 +361,9 @@ public class SitemapProcessor extends Configured implements
Tool {
public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean
strict, boolean filter,
boolean normalize, int threads) throws Exception {
- long start = System.currentTimeMillis();
- LOG.info("SitemapProcessor: Starting at {}", sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("SitemapProcessor: starting");
FileSystem fs = crawldb.getFileSystem(getConf());
Path old = new Path(crawldb, "old");
@@ -441,8 +444,9 @@ public class SitemapProcessor extends Configured implements
Tool {
LOG.info("SitemapProcessor: Total failed sitemap fetches: {}",
failedFetches);
LOG.info("SitemapProcessor: Total new sitemap entries added: {}",
newSitemapEntries);
- long end = System.currentTimeMillis();
- LOG.info("SitemapProcessor: Finished at {}, elapsed: {}",
sdf.format(end), TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("SitemapProcessor: finished, elapsed: {} ms",
stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
}
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("SitemapProcessor_" + crawldb.toString(), e);
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index 638b6c94f..f77b72bc5 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -20,8 +20,9 @@ import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
-import java.text.SimpleDateFormat;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
@@ -39,7 +40,6 @@ import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -92,9 +92,9 @@ public class DomainStatistics extends Configured implements
Tool {
numOfReducers = Integer.parseInt(args[3]);
}
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- long start = System.currentTimeMillis();
- LOG.info("DomainStatistics: starting at " + sdf.format(start));
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
+ LOG.info("DomainStatistics: starting");
int mode = 0;
String jobName = "DomainStatistics";
@@ -151,9 +151,9 @@ public class DomainStatistics extends Configured implements
Tool {
throw e;
}
- long end = System.currentTimeMillis();
- LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ stopWatch.stop();
+ LOG.info("DomainStatistics: finished, elapsed: {} ms ", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
return 0;
}
diff --git
a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
index c77c67eb1..080b2e587 100644
---
a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
+++
b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -16,7 +16,6 @@
*/
package org.apache.nutch.urlfilter.api;
-// JDK imports
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.FileReader;
@@ -24,12 +23,13 @@ import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.time.StopWatch;
import org.junit.Assert;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-// Nutch imports
import org.apache.nutch.net.URLFilter;
/**
@@ -58,7 +58,8 @@ public abstract class RegexURLFilterBaseTest {
}
protected void bench(int loops, Reader rules, Reader urls) {
- long start = System.currentTimeMillis();
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
try {
URLFilter filter = getURLFilter(rules);
FilteredURL[] expected = readURLFile(urls);
@@ -68,8 +69,8 @@ public abstract class RegexURLFilterBaseTest {
} catch (Exception e) {
Assert.fail(e.toString());
}
- LOG.info("bench time (" + loops + ") "
- + (System.currentTimeMillis() - start) + "ms");
+ stopWatch.stop();
+ LOG.info("bench time {} loops {} ms", loops,
stopWatch.getTime(TimeUnit.MILLISECONDS));
}
protected void bench(int loops, String rulesFile, String urlsFile) {
diff --git
a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
index 1eee7183b..4952a1da4 100644
---
a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
+++
b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
@@ -25,11 +25,13 @@ import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
+import java.util.concurrent.TimeUnit;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
@@ -104,7 +106,8 @@ public class TestRegexURLNormalizer {
}
private void bench(int loops, String scope) {
- long start = System.currentTimeMillis();
+ StopWatch stopWatch = new StopWatch();
+ stopWatch.start();
try {
NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope);
if (expected == null)
@@ -115,8 +118,9 @@ public class TestRegexURLNormalizer {
} catch (Exception e) {
Assert.fail(e.toString());
}
+ stopWatch.stop();
LOG.info("bench time (" + loops + ") "
- + (System.currentTimeMillis() - start) + "ms");
+ + (stopWatch.getTime(TimeUnit.MILLISECONDS)) + "ms");
}
private static class NormalizedURL {