[ https://issues.apache.org/jira/browse/NUTCH-2463?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16268548#comment-16268548 ]
ASF GitHub Bot commented on NUTCH-2463:
---------------------------------------
sebastian-nagel closed pull request #243: NUTCH-2463 - Enable sampling CrawlDB
URL: https://github.com/apache/nutch/pull/243
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index bfb016428..e245e380c 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -511,7 +511,7 @@ public void readUrl(String crawlDb, String url, JobConf config)
public void processDumpJob(String crawlDb, String output,
JobConf config, String format, String regex, String status,
- Integer retry, String expr) throws IOException {
+ Integer retry, String expr, Float sample) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb dump: starting");
LOG.info("CrawlDb db: " + crawlDb);
@@ -544,6 +544,8 @@ public void processDumpJob(String crawlDb, String output,
job.set("expr", expr);
LOG.info("CrawlDb db: expr: " + expr);
}
+ if (sample != null)
+ job.setFloat("sample", sample);
job.setMapperClass(CrawlDbDumpMapper.class);
job.setOutputKeyClass(Text.class);
@@ -562,6 +564,7 @@ public void processDumpJob(String crawlDb, String output,
String status = null;
Integer retry = null;
Expression expr = null;
+ float sample;
public void configure(JobConf job) {
if (job.get("regex", null) != null) {
@@ -573,6 +576,7 @@ public void configure(JobConf job) {
if (job.get("expr", null) != null) {
expr = JexlUtil.parseExpression(job.get("expr", null));
}
+ sample = job.getFloat("sample", 1);
}
public void close() {
@@ -582,6 +586,10 @@ public void map(Text key, CrawlDatum value,
OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
+ // check sample
+ if (sample < 1 && Math.random() > sample) {
+ return;
+ }
// check retry
if (retry != -1) {
if (value.getRetriesSinceFetch() < retry) {
@@ -693,6 +701,7 @@ public int run(String[] args) throws IOException {
System.err
.println("\t\t[-status <status>]\tfilter records by CrawlDatum status");
System.err.println("\t\t[-expr <expr>]\tJexl expression to evaluate for
this record");
+ System.err.println("\t\t[-sample <fraction>]\tOnly process a random
sample with this ratio");
System.err
.println("\t-url <url>\tprint information on <url> to System.out");
System.err
@@ -720,6 +729,7 @@ public int run(String[] args) throws IOException {
Integer retry = null;
String status = null;
String expr = null;
+ Float sample = null;
for (int j = i + 1; j < args.length; j++) {
if (args[j].equals("-format")) {
format = args[++j];
@@ -741,8 +751,12 @@ public int run(String[] args) throws IOException {
expr = args[++j];
i=i+2;
}
+ if (args[j].equals("-sample")) {
+ sample = Float.parseFloat(args[++j]);
+ i = i + 2;
+ }
}
- dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry, expr);
+ dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry, expr, sample);
} else if (args[i].equals("-url")) {
param = args[++i];
dbr.readUrl(crawlDb, param, job);
@@ -833,6 +847,7 @@ public Object query(Map<String, String> args, Configuration conf, String type, S
Integer retry = null;
String status = null;
String expr = null;
+ Float sample = null;
if (args.containsKey("format")) {
format = args.get("format");
}
@@ -848,7 +863,10 @@ public Object query(Map<String, String> args, Configuration conf, String type, S
if (args.containsKey("expr")) {
expr = args.get("expr");
}
- processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry, expr);
+ if (args.containsKey("sample")) {
+ sample = Float.parseFloat(args.get("sample"));
+ }
+ processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry, expr, sample);
File dumpFile = new File(output+"/part-00000");
return dumpFile;
}
@@ -886,4 +904,4 @@ public Object query(Map<String, String> args, Configuration conf, String type, S
}
return results;
}
-}
\ No newline at end of file
+}
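
Note on the mechanism: the check added at the top of CrawlDbDumpMapper.map() performs independent Bernoulli sampling. Each record is kept with probability sample, so a dump over N records yields approximately sample * N records, with the exact count varying between runs. A minimal standalone sketch of the same idea (the class and test data below are illustrative, not part of the patch):

    import java.util.ArrayList;
    import java.util.List;

    public class BernoulliSampleSketch {

      // Keep each record independently with probability fraction
      // (0 < fraction <= 1), mirroring the early return added to
      // CrawlDbDumpMapper.map().
      static <T> List<T> sample(List<T> records, float fraction) {
        List<T> kept = new ArrayList<>();
        for (T record : records) {
          if (fraction < 1 && Math.random() > fraction) {
            continue; // record dropped from the dump
          }
          kept.add(record);
        }
        return kept;
      }

      public static void main(String[] args) {
        List<Integer> records = new ArrayList<>();
        for (int i = 0; i < 100000; i++) {
          records.add(i);
        }
        // Expect roughly 1000 kept records for fraction 0.01; varies per run.
        System.out.println(sample(records, 0.01f).size());
      }
    }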
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> Enable sampling CrawlDB
> -----------------------
>
> Key: NUTCH-2463
> URL: https://issues.apache.org/jira/browse/NUTCH-2463
> Project: Nutch
> Issue Type: Improvement
> Components: crawldb
> Reporter: Yossi Tamari
> Priority: Minor
> Fix For: 1.14
>
>
> CrawlDB can grow to contain billions of records. When that happens,
> *readdb -dump* is of little practical use, and *readdb -topN* can run for
> ages (and does not provide a statistically correct sample).
> We should add a parameter *-sample* to *readdb -dump*, followed by a number
> between 0 and 1; only that fraction of the records in the CrawlDB will be
> processed.
> The sample should be statistically random, and all the other filters should
> be applied to the sampled records.
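
With the patch applied, a sampled dump can be requested from the command line; the CrawlDb path and output directory below are placeholders:

    bin/nutch readdb crawl/crawldb -dump crawldb_sample -sample 0.01

The sample check runs before the other dump filters (-regex, -status, -retry, -expr), so those filters are applied to the sampled records, as requested above.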
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)