This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new be3aea1 NUTCH-2367 Get single record from HostDB be3aea1 is described below commit be3aea1410835b34cfacdff7c3def9fb01a83e76 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Thu Mar 16 11:40:02 2017 +0100 NUTCH-2367 Get single record from HostDB --- src/java/org/apache/nutch/hostdb/ReadHostDb.java | 39 ++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index 5b08504..17e135a 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -30,9 +30,11 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; @@ -200,6 +202,29 @@ public class ReadHostDb extends Configured implements Tool { long end = System.currentTimeMillis(); LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } + + private void getHostDbRecord(Path hostDb, String host) throws Exception { + Configuration conf = getConf(); + SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, hostDb); + + Class<?> keyClass = readers[0].getKeyClass(); + Class<?> valueClass = readers[0].getValueClass(); + + if (!keyClass.getName().equals("org.apache.hadoop.io.Text")) + throw new IOException("Incompatible key (" + keyClass.getName() + ")"); + + Text key = (Text) keyClass.newInstance(); + HostDatum value = (HostDatum) valueClass.newInstance(); + + for (int i = 0; i < readers.length; i++) { + while (readers[i].next(key, value)) { + if (host.equals(key.toString())) { + System.out.println(value.toString()); + } + } + readers[i].close(); + } + } public static void main(String args[]) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new ReadHostDb(), args); @@ -208,13 +233,14 @@ public class ReadHostDb extends Configured implements Tool { public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: ReadHostDb <hostdb> <output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]"); + System.err.println("Usage: ReadHostDb <hostdb> [-get <url>] [<output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]]"); return -1; } boolean dumpHomepages = false; boolean dumpHostnames = false; String expr = null; + String get = null; for (int i = 0; i < args.length; i++) { if (args[i].equals("-dumpHomepages")) { @@ -225,6 +251,11 @@ public class ReadHostDb extends Configured implements Tool { LOG.info("ReadHostDb: dumping hostnames"); dumpHostnames = true; } + if (args[i].equals("-get")) { + get = args[i + 1]; + LOG.info("ReadHostDb: get: "+ get); + i++; + } if (args[i].equals("-expr")) { expr = args[i + 1]; LOG.info("ReadHostDb: evaluating expression: " + expr); @@ -233,7 +264,11 @@ public class ReadHostDb extends Configured implements Tool { } try { - readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr); + if (get != null) { + getHostDbRecord(new Path(args[0], "current"), get); + } else { + readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr); + } return 0; } catch (Exception e) { LOG.error("ReadHostDb: " + StringUtils.stringifyException(e)); -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].