Author: sharad
Date: Tue Apr 21 07:34:00 2009
New Revision: 767055
URL: http://svn.apache.org/viewvc?rev=767055&view=rev
Log:
HADOOP-5697. Change org.apache.hadoop.examples.Grep to use new mapreduce api.
Contributed by Amareshwari Sriramadasu.
Added:
hadoop/core/trunk/src/mapred/org/apache/hadoop/mapreduce/lib/map/RegexMapper.java
Modified:
hadoop/core/trunk/CHANGES.txt
hadoop/core/trunk/src/examples/org/apache/hadoop/examples/Grep.java
hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/RegexMapper.java
Modified: hadoop/core/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/CHANGES.txt?rev=767055&r1=767054&r2=767055&view=diff
==============================================================================
--- hadoop/core/trunk/CHANGES.txt (original)
+++ hadoop/core/trunk/CHANGES.txt Tue Apr 21 07:34:00 2009
@@ -243,6 +243,9 @@
HADOOP-5681. Change examples RandomWriter and RandomTextWriter to
use new mapreduce API. (Amareshwari Sriramadasu via sharad)
+ HADOOP-5697. Change org.apache.hadoop.examples.Grep to use new
+ mapreduce api. (Amareshwari Sriramadasu via sharad)
+
OPTIMIZATIONS
HADOOP-5595. NameNode does not need to run a replicator to choose a
Modified: hadoop/core/trunk/src/examples/org/apache/hadoop/examples/Grep.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/examples/org/apache/hadoop/examples/Grep.java?rev=767055&r1=767054&r2=767055&view=diff
==============================================================================
--- hadoop/core/trunk/src/examples/org/apache/hadoop/examples/Grep.java (original)
+++ hadoop/core/trunk/src/examples/org/apache/hadoop/examples/Grep.java Tue Apr 21 07:34:00 2009
@@ -25,8 +25,14 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.mapred.lib.*;
+import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
+import org.apache.hadoop.mapreduce.lib.map.RegexMapper;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@@ -38,14 +44,19 @@
if (args.length < 3) {
System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
ToolRunner.printGenericCommandUsage(System.out);
- return -1;
+ return 2;
}
Path tempDir =
new Path("grep-temp-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- JobConf grepJob = new JobConf(getConf(), Grep.class);
+ Configuration conf = getConf();
+ conf.set("mapred.mapper.regex", args[2]);
+ if (args.length == 4)
+ conf.set("mapred.mapper.regex.group", args[3]);
+
+ Job grepJob = new Job(conf);
try {
@@ -54,37 +65,34 @@
FileInputFormat.setInputPaths(grepJob, args[0]);
grepJob.setMapperClass(RegexMapper.class);
- grepJob.set("mapred.mapper.regex", args[2]);
- if (args.length == 4)
- grepJob.set("mapred.mapper.regex.group", args[3]);
grepJob.setCombinerClass(LongSumReducer.class);
grepJob.setReducerClass(LongSumReducer.class);
FileOutputFormat.setOutputPath(grepJob, tempDir);
- grepJob.setOutputFormat(SequenceFileOutputFormat.class);
+ grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
grepJob.setOutputKeyClass(Text.class);
grepJob.setOutputValueClass(LongWritable.class);
- JobClient.runJob(grepJob);
+ grepJob.waitForCompletion(true);
- JobConf sortJob = new JobConf(Grep.class);
+ Job sortJob = new Job(conf);
sortJob.setJobName("grep-sort");
FileInputFormat.setInputPaths(sortJob, tempDir);
- sortJob.setInputFormat(SequenceFileInputFormat.class);
+ sortJob.setInputFormatClass(SequenceFileInputFormat.class);
sortJob.setMapperClass(InverseMapper.class);
sortJob.setNumReduceTasks(1); // write a single file
FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
- sortJob.setOutputKeyComparatorClass // sort by decreasing freq
- (LongWritable.DecreasingComparator.class);
+ sortJob.setSortComparatorClass( // sort by decreasing freq
+ LongWritable.DecreasingComparator.class);
- JobClient.runJob(sortJob);
+ sortJob.waitForCompletion(true);
}
finally {
- FileSystem.get(grepJob).delete(tempDir, true);
+ FileSystem.get(conf).delete(tempDir, true);
}
return 0;
}
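For readers following the migration, the essential shape of the new-API job setup is sketched below. This is a standalone example, not part of the patch; the class name, paths, and regex values are made up for illustration.

// Minimal sketch (not part of this patch) of the new-API job setup the
// Grep example migrates to: regex settings go on the Configuration, the
// Job is built from it, and waitForCompletion() replaces JobClient.runJob().
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.RegexMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;

public class NewApiGrepSketch {
  public static void main(String[] args) throws Exception {
    // Set the regex before constructing the Job, since Job copies the
    // Configuration it is given.
    Configuration conf = new Configuration();
    conf.set("mapred.mapper.regex", "dfs[a-z.]+");   // illustrative pattern
    conf.set("mapred.mapper.regex.group", "0");      // optional capture group

    Job job = new Job(conf);                         // new org.apache.hadoop.mapreduce API
    job.setJobName("grep-search");
    job.setMapperClass(RegexMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(job, new Path("in"));
    FileOutputFormat.setOutputPath(job, new Path("out"));

    // Blocks until the job finishes; true requests progress logging.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}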
Modified: hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/RegexMapper.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/RegexMapper.java?rev=767055&r1=767054&r2=767055&view=diff
==============================================================================
--- hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/RegexMapper.java (original)
+++ hadoop/core/trunk/src/mapred/org/apache/hadoop/mapred/lib/RegexMapper.java Tue Apr 21 07:34:00 2009
@@ -31,7 +31,10 @@
import org.apache.hadoop.mapred.Reporter;
-/** A {@link Mapper} that extracts text matching a regular expression. */
+/** A {@link Mapper} that extracts text matching a regular expression.
+ * @deprecated Use {@link org.apache.hadoop.mapreduce.lib.map.RegexMapper}
+ */
+@Deprecated
public class RegexMapper<K> extends MapReduceBase
implements Mapper<K, Text, Text, LongWritable> {
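The old-API class stays usable for existing JobConf-based jobs, which now simply compile with a deprecation warning. For comparison with the new-API sketch above, a minimal old-API configuration (illustrative values, not taken from the patch) looks like this:

// Old-API (org.apache.hadoop.mapred) configuration of the now-deprecated
// RegexMapper; values are illustrative only.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.LongSumReducer;
import org.apache.hadoop.mapred.lib.RegexMapper;

public class OldApiGrepSketch {
  public static JobConf configure() {
    JobConf job = new JobConf(OldApiGrepSketch.class);
    job.setJobName("grep-search-old-api");
    job.setMapperClass(RegexMapper.class);           // now carries @Deprecated
    job.set("mapred.mapper.regex", "dfs[a-z.]+");    // set directly on the JobConf
    job.set("mapred.mapper.regex.group", "0");
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    return job;
  }
}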
Added: hadoop/core/trunk/src/mapred/org/apache/hadoop/mapreduce/lib/map/RegexMapper.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/mapred/org/apache/hadoop/mapreduce/lib/map/RegexMapper.java?rev=767055&view=auto
==============================================================================
--- hadoop/core/trunk/src/mapred/org/apache/hadoop/mapreduce/lib/map/RegexMapper.java (added)
+++ hadoop/core/trunk/src/mapred/org/apache/hadoop/mapreduce/lib/map/RegexMapper.java Tue Apr 21 07:34:00 2009
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.lib.map;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+
+/** A {@link Mapper} that extracts text matching a regular expression. */
+public class RegexMapper<K> extends Mapper<K, Text, Text, LongWritable> {
+
+ private Pattern pattern;
+ private int group;
+
+ public void setup(Context context) {
+ Configuration conf = context.getConfiguration();
+ pattern = Pattern.compile(conf.get("mapred.mapper.regex"));
+ group = conf.getInt("mapred.mapper.regex.group", 0);
+ }
+
+ public void map(K key, Text value,
+ Context context)
+ throws IOException, InterruptedException {
+ String text = value.toString();
+ Matcher matcher = pattern.matcher(text);
+ while (matcher.find()) {
+ context.write(new Text(matcher.group(group)), new LongWritable(1));
+ }
+ }
+}
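To make the new mapper's behaviour concrete, here is a small plain-JDK sketch (no MapReduce dependencies; the regex, group, and input record are made-up values) mirroring the loop in map(): one (match, 1) pair per occurrence of the pattern, keyed by the configured capture group (group 0, the whole match, by default).

// Illustrates the output of RegexMapper.map() for a single input record,
// using java.util.regex directly; all values are illustrative.
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexMapperSemantics {
  public static void main(String[] args) {
    Pattern pattern = Pattern.compile("dfs[a-z.]+"); // stands in for "mapred.mapper.regex"
    int group = 0;                                   // stands in for "mapred.mapper.regex.group"
    String record = "dfs.block.size dfs.replication fs.default.name";

    Matcher matcher = pattern.matcher(record);
    while (matcher.find()) {
      // The real mapper calls context.write(new Text(...), new LongWritable(1)) here.
      System.out.println(matcher.group(group) + "\t1");
    }
    // Prints "dfs.block.size<TAB>1" and "dfs.replication<TAB>1", one per line.
  }
}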