Repository: hadoop Updated Branches: refs/heads/HADOOP-12930 98cfb92cf -> 31f0477b8
HADOOP-13110. add a streaming subcommand to mapred Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/31f0477b Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/31f0477b Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/31f0477b Branch: refs/heads/HADOOP-12930 Commit: 31f0477b83d600dffb675718676a0303d0da016c Parents: 98cfb92 Author: Allen Wittenauer <a...@apache.org> Authored: Fri May 6 14:00:56 2016 -0700 Committer: Allen Wittenauer <a...@apache.org> Committed: Fri May 6 14:00:56 2016 -0700 ---------------------------------------------------------------------- .../main/resources/assemblies/hadoop-tools.xml | 8 +++ .../apache/hadoop/streaming/DumpTypedBytes.java | 3 +- .../hadoop/streaming/HadoopStreaming.java | 3 +- .../apache/hadoop/streaming/LoadTypedBytes.java | 3 +- .../src/main/shellprofile.d/hadoop-streaming.sh | 55 ++++++++++++++++++++ .../src/site/markdown/HadoopStreaming.md.vm | 30 +++++------ 6 files changed, 81 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/31f0477b/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml ---------------------------------------------------------------------- diff --git a/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml b/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml index 8606e23..3909277 100644 --- a/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml +++ b/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml @@ -148,6 +148,14 @@ </includes> </fileSet> <fileSet> + <directory>../hadoop-streaming/src/main/shellprofile.d</directory> + <includes> + <include>*</include> + </includes> + <outputDirectory>/libexec/shellprofile.d</outputDirectory> + <fileMode>0755</fileMode> + </fileSet> + <fileSet> <directory>../hadoop-sls/target</directory> <outputDirectory>/share/hadoop/${hadoop.component}/sources</outputDirectory> <includes> http://git-wip-us.apache.org/repos/asf/hadoop/blob/31f0477b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java index 5a07cc3..ffddc7c 100644 --- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java @@ -91,8 +91,7 @@ public class DumpTypedBytes implements Tool { } private void printUsage() { - System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar" - + " dumptb <glob-pattern>"); + System.out.println("Usage: mapred streaming dumptb <glob-pattern>"); System.out.println(" Dumps all files that match the given pattern to " + "standard output as typed bytes."); System.out.println(" The files can be text or sequence files"); http://git-wip-us.apache.org/repos/asf/hadoop/blob/31f0477b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java index eabf46c..92f9d03 100644 --- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java @@ -56,8 +56,7 @@ public class HadoopStreaming { } private static void printUsage() { - System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar" - + " [options]"); + System.out.println("Usage: mapred streaming [options]"); System.out.println("Options:"); System.out.println(" dumptb <glob-pattern> Dumps all files that match the" + " given pattern to "); http://git-wip-us.apache.org/repos/asf/hadoop/blob/31f0477b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java index a7a001c..838cfa1 100644 --- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java @@ -89,8 +89,7 @@ public class LoadTypedBytes implements Tool { } private void printUsage() { - System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar" - + " loadtb <path>"); + System.out.println("Usage: mapred streaming loadtb <path>"); System.out.println(" Reads typed bytes from standard input" + " and stores them in a sequence file in"); System.out.println(" the specified path"); http://git-wip-us.apache.org/repos/asf/hadoop/blob/31f0477b/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh b/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh new file mode 100755 index 0000000..cca016d --- /dev/null +++ b/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if ! declare -f mapred_subcommand_streaming >/dev/null 2>/dev/null; then + + if [[ "${HADOOP_SHELL_EXECNAME}" = mapred ]]; then + hadoop_add_subcommand "streaming" "launch a mapreduce streaming job" + fi + +## @description streaming command for mapred +## @audience public +## @stability stable +## @replaceable yes +function mapred_subcommand_streaming +{ + declare jarname + declare oldifs + + # shellcheck disable=SC2034 + HADOOP_CLASSNAME=org.apache.hadoop.util.RunJar + hadoop_add_to_classpath_tools hadoop-streaming + + # locate the streaming jar so we have something to + # give to RunJar + oldifs=${IFS} + IFS=: + for jarname in ${CLASSPATH}; do + if [[ "${jarname}" =~ hadoop-streaming-[0-9] ]]; then + HADOOP_SUBCMD_ARGS=("${jarname}" "${HADOOP_SUBCMD_ARGS[@]}") + break + fi + done + + IFS=${oldifs} + + hadoop_debug "Appending HADOOP_CLIENT_OPTS onto HADOOP_OPTS" + HADOOP_OPTS="${HADOOP_OPTS} ${HADOOP_CLIENT_OPTS}" + +} + +fi http://git-wip-us.apache.org/repos/asf/hadoop/blob/31f0477b/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm b/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm index cc8ed69..072a68b 100644 --- a/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm +++ b/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm @@ -62,7 +62,7 @@ Hadoop Streaming Hadoop streaming is a utility that comes with the Hadoop distribution. The utility allows you to create and run Map/Reduce jobs with any executable or script as the mapper and/or the reducer. For example: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -input myInputDirs \ -output myOutputDir \ -mapper /bin/cat \ @@ -88,7 +88,7 @@ Streaming supports streaming command options as well as [generic command options **Note:** Be sure to place the generic options before the streaming options, otherwise the command will fail. For an example, see [Making Archives Available to Tasks](#Making_Archives_Available_to_Tasks). - hadoop command [genericOptions] [streamingOptions] + mapred streaming [genericOptions] [streamingOptions] The Hadoop streaming command options are listed here: @@ -115,7 +115,7 @@ $H3 Specifying a Java Class as the Mapper/Reducer You can supply a Java class as the mapper and/or the reducer. - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -input myInputDirs \ -output myOutputDir \ -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \ @@ -128,7 +128,7 @@ $H3 Packaging Files With Job Submissions You can specify any executable as the mapper and/or the reducer. The executables do not need to pre-exist on the machines in the cluster; however, if they don't, you will need to use "-file" option to tell the framework to pack your executable files as a part of job submission. For example: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -input myInputDirs \ -output myOutputDir \ -mapper myPythonScript.py \ @@ -139,7 +139,7 @@ The above example specifies a user defined Python executable as the mapper. The In addition to executable files, you can also package other auxiliary files (such as dictionaries, configuration files, etc) that may be used by the mapper and/or the reducer. For example: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -input myInputDirs \ -output myOutputDir \ -mapper myPythonScript.py \ @@ -216,7 +216,7 @@ $H4 Specifying the Number of Reducers To specify the number of reducers, for example two, use: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -D mapreduce.job.reduces=2 \ -input myInputDirs \ -output myOutputDir \ @@ -229,7 +229,7 @@ As noted earlier, when the Map/Reduce framework reads a line from the stdout of However, you can customize this default. You can specify a field separator other than the tab character (the default), and you can specify the nth (n \>= 1) character rather than the first character in a line (the default) as the separator between the key and value. For example: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -D stream.map.output.field.separator=. \ -D stream.num.map.output.key.fields=4 \ -input myInputDirs \ @@ -279,7 +279,7 @@ User can specify a different symlink name for -archives using \#. In this example, the input.txt file has two lines specifying the names of the two files: cachedir.jar/cache.txt and cachedir.jar/cache2.txt. "cachedir.jar" is a symlink to the archived directory, which has the files "cache.txt" and "cache2.txt". - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -archives 'hdfs://hadoop-nn1.example.com/user/me/samples/cachefile/cachedir.jar' \ -D mapreduce.job.maps=1 \ -D mapreduce.job.reduces=1 \ @@ -325,7 +325,7 @@ $H3 Hadoop Partitioner Class Hadoop has a library class, [KeyFieldBasedPartitioner](../api/org/apache/hadoop/mapred/lib/KeyFieldBasedPartitioner.html), that is useful for many applications. This class allows the Map/Reduce framework to partition the map outputs based on certain key fields, not the whole keys. For example: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -D stream.map.output.field.separator=. \ -D stream.num.map.output.key.fields=4 \ -D map.output.key.field.separator=. \ @@ -375,7 +375,7 @@ $H3 Hadoop Comparator Class Hadoop has a library class, [KeyFieldBasedComparator](../api/org/apache/hadoop/mapreduce/lib/partition/KeyFieldBasedComparator.html), that is useful for many applications. This class provides a subset of features provided by the Unix/GNU Sort. For example: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \ -D stream.map.output.field.separator=. \ -D stream.num.map.output.key.fields=4 \ @@ -411,7 +411,7 @@ Hadoop has a library package called [Aggregate](../api/org/apache/hadoop/mapred/ To use Aggregate, simply specify "-reducer aggregate": - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -input myInputDirs \ -output myOutputDir \ -mapper myAggregatorForKeyCount.py \ @@ -444,7 +444,7 @@ $H3 Hadoop Field Selection Class Hadoop has a library class, [FieldSelectionMapReduce](../api/org/apache/hadoop/mapred/lib/FieldSelectionMapReduce.html), that effectively allows you to process text data like the unix "cut" utility. The map function defined in the class treats each input key/value pair as a list of fields. You can specify the field separator (the default is the tab character). You can select an arbitrary list of fields as the map output key, and an arbitrary list of fields as the map output value. Similarly, the reduce function defined in the class treats each input key/value pair as a list of fields. You can select an arbitrary list of fields as the reduce output key, and an arbitrary list of fields as the reduce output value. For example: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -D mapreduce.map.output.key.field.separator=. \ -D mapreduce.partition.keypartitioner.options=-k1,2 \ -D mapreduce.fieldsel.data.field.separator=. \ @@ -495,7 +495,7 @@ Using an alias will not work, but variable substitution is allowed as shown in t charlie 80 dan 75 - $ c2='cut -f2'; hadoop jar hadoop-streaming-${project.version}.jar \ + $ c2='cut -f2'; mapred streaming \ -D mapreduce.job.name='Experiment' \ -input /user/me/samples/student_marks \ -output /user/me/samples/student_out \ @@ -525,7 +525,7 @@ $H3 How do I specify multiple input directories? You can specify multiple input directories with multiple '-input' options: - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -input '/user/foo/dir1' -input '/user/foo/dir2' \ (rest of the command) @@ -541,7 +541,7 @@ $H3 How do I parse XML documents using streaming? You can use the record reader StreamXmlRecordReader to process XML documents. - hadoop jar hadoop-streaming-${project.version}.jar \ + mapred streaming \ -inputreader "StreamXmlRecord,begin=BEGIN_STRING,end=END_STRING" \ (rest of the command) --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org