CRUNCH-32: Clean up namespaces. Rename Maven module "scrunch" to "crunch-scrunch". Change artifactId of "scrunch" to "crunch-scrunch". Move package "org.apache.scrunch" to "org.apache.crunch.scrunch". Rename Maven module "examples" to "crunch-examples".
Signed-off-by: jwills <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/dfd28922 Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/dfd28922 Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/dfd28922 Branch: refs/heads/master Commit: dfd28922d6c5740db3d5170d94db546d22077ba4 Parents: a92a523 Author: Matthias Friedrich <[email protected]> Authored: Sun Aug 5 20:03:36 2012 +0200 Committer: jwills <[email protected]> Committed: Tue Aug 7 09:17:40 2012 -0700 ---------------------------------------------------------------------- crunch-examples/pom.xml | 78 + crunch-examples/src/main/assembly/hadoop-job.xml | 41 + .../apache/crunch/examples/AverageBytesByIP.java | 134 + .../org/apache/crunch/examples/TotalBytesByIP.java | 106 + .../java/org/apache/crunch/examples/WordCount.java | 76 + crunch-examples/src/site/markdown/index.md | 20 + crunch-examples/src/site/site.xml | 34 + crunch-scrunch/build.sbt | 51 + crunch-scrunch/pom.xml | 199 + crunch-scrunch/src/it/resources/log4j.properties | 28 + crunch-scrunch/src/it/resources/maugham.txt |29112 +++++++++++++++ crunch-scrunch/src/it/resources/shakes.txt | 3667 ++ crunch-scrunch/src/it/resources/urls.txt | 11 + .../org/apache/crunch/scrunch/CogroupTest.scala | 42 + .../scala/org/apache/crunch/scrunch/JoinTest.scala | 45 + .../apache/crunch/scrunch/PageRankClassTest.scala | 119 + .../org/apache/crunch/scrunch/PageRankTest.scala | 105 + .../apache/crunch/scrunch/PipelineAppTest.scala | 49 + .../scala/org/apache/crunch/scrunch/TopTest.scala | 42 + .../org/apache/crunch/scrunch/UnionTest.scala | 52 + .../org/apache/crunch/scrunch/WordCountTest.scala | 42 + .../scrunch/interpreter/InterpreterJarTest.scala | 69 + crunch-scrunch/src/main/assembly/release.xml | 93 + crunch-scrunch/src/main/conf/log4j.properties | 24 + .../src/main/examples/ClassyPageRank.scala | 71 + 
crunch-scrunch/src/main/examples/PageRank.scala | 61 + crunch-scrunch/src/main/examples/WordCount.scala | 27 + .../crunch/scrunch/ScalaReflectDataFactory.java | 41 + .../crunch/scrunch/ScalaSafeReflectData.java | 292 + .../scrunch/ScalaSafeReflectDatumReader.java | 122 + .../scrunch/ScalaSafeReflectDatumWriter.java | 68 + .../org/apache/crunch/scrunch/Conversions.scala | 147 + .../apache/crunch/scrunch/EmbeddedPipeline.scala | 47 + .../crunch/scrunch/EmbeddedPipelineLike.scala | 127 + .../main/scala/org/apache/crunch/scrunch/IO.scala | 50 + .../main/scala/org/apache/crunch/scrunch/Mem.scala | 88 + .../org/apache/crunch/scrunch/PCollection.scala | 118 + .../apache/crunch/scrunch/PCollectionLike.scala | 48 + .../org/apache/crunch/scrunch/PGroupedTable.scala | 92 + .../scala/org/apache/crunch/scrunch/PTable.scala | 191 + .../org/apache/crunch/scrunch/PTypeFamily.scala | 127 + .../scala/org/apache/crunch/scrunch/Pipeline.scala | 164 + .../org/apache/crunch/scrunch/PipelineApp.scala | 64 + .../org/apache/crunch/scrunch/PipelineHelper.scala | 74 + .../org/apache/crunch/scrunch/PipelineLike.scala | 91 + .../scrunch/interpreter/InterpreterRunner.scala | 208 + crunch-scrunch/src/main/scripts/imports.scala | 19 + crunch-scrunch/src/main/scripts/scrunch | 163 + crunch-scrunch/src/main/scripts/scrunch-job.py | 133 + crunch-scrunch/src/site/markdown/index.md | 20 + crunch-scrunch/src/site/site.xml | 34 + examples/pom.xml | 78 - examples/src/main/assembly/hadoop-job.xml | 41 - .../apache/crunch/examples/AverageBytesByIP.java | 134 - .../org/apache/crunch/examples/TotalBytesByIP.java | 106 - .../java/org/apache/crunch/examples/WordCount.java | 76 - examples/src/site/markdown/index.md | 20 - examples/src/site/site.xml | 34 - pom.xml | 4 +- scrunch/build.sbt | 51 - scrunch/pom.xml | 199 - scrunch/src/it/resources/log4j.properties | 29 - scrunch/src/it/resources/maugham.txt |29112 --------------- scrunch/src/it/resources/shakes.txt | 3667 -- scrunch/src/it/resources/urls.txt | 11 - 
.../it/scala/org/apache/scrunch/CogroupTest.scala | 42 - .../src/it/scala/org/apache/scrunch/JoinTest.scala | 45 - .../org/apache/scrunch/PageRankClassTest.scala | 119 - .../it/scala/org/apache/scrunch/PageRankTest.scala | 105 - .../scala/org/apache/scrunch/PipelineAppTest.scala | 49 - .../src/it/scala/org/apache/scrunch/TopTest.scala | 42 - .../it/scala/org/apache/scrunch/UnionTest.scala | 52 - .../scala/org/apache/scrunch/WordCountTest.scala | 42 - .../scrunch/interpreter/InterpreterJarTest.scala | 69 - scrunch/src/main/assembly/release.xml | 93 - scrunch/src/main/conf/log4j.properties | 24 - scrunch/src/main/examples/ClassyPageRank.scala | 71 - scrunch/src/main/examples/PageRank.scala | 61 - scrunch/src/main/examples/WordCount.scala | 27 - .../apache/scrunch/ScalaReflectDataFactory.java | 41 - .../org/apache/scrunch/ScalaSafeReflectData.java | 292 - .../scrunch/ScalaSafeReflectDatumReader.java | 122 - .../scrunch/ScalaSafeReflectDatumWriter.java | 68 - .../scala/org/apache/scrunch/Conversions.scala | 147 - .../org/apache/scrunch/EmbeddedPipeline.scala | 47 - .../org/apache/scrunch/EmbeddedPipelineLike.scala | 127 - scrunch/src/main/scala/org/apache/scrunch/IO.scala | 50 - .../src/main/scala/org/apache/scrunch/Mem.scala | 88 - .../scala/org/apache/scrunch/PCollection.scala | 118 - .../scala/org/apache/scrunch/PCollectionLike.scala | 48 - .../scala/org/apache/scrunch/PGroupedTable.scala | 92 - .../src/main/scala/org/apache/scrunch/PTable.scala | 191 - .../scala/org/apache/scrunch/PTypeFamily.scala | 127 - .../main/scala/org/apache/scrunch/Pipeline.scala | 164 - .../scala/org/apache/scrunch/PipelineApp.scala | 64 - .../scala/org/apache/scrunch/PipelineHelper.scala | 74 - .../scala/org/apache/scrunch/PipelineLike.scala | 91 - .../scrunch/interpreter/InterpreterRunner.scala | 208 - scrunch/src/main/scripts/imports.scala | 19 - scrunch/src/main/scripts/scrunch | 163 - scrunch/src/main/scripts/scrunch-job.py | 133 - scrunch/src/site/markdown/index.md | 20 - 
scrunch/src/site/site.xml | 34 - src/site/markdown/scrunch.md | 7 +- 104 files changed, 36931 insertions(+), 36933 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/pom.xml ---------------------------------------------------------------------- diff --git a/crunch-examples/pom.xml b/crunch-examples/pom.xml new file mode 100644 index 0000000..4b3113f --- /dev/null +++ b/crunch-examples/pom.xml @@ -0,0 +1,78 @@ +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch-parent</artifactId> + <version>0.3.0-incubating-SNAPSHOT</version> + </parent> + + <artifactId>crunch-examples</artifactId> + <name>Crunch Examples</name> + + <dependencies> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch</artifactId> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <configuration> + <descriptors> + <descriptor>src/main/assembly/hadoop-job.xml</descriptor> + </descriptors> + <archive> + <manifest> + <mainClass>org.apache.crunch.examples.WordCount</mainClass> + </manifest> + </archive> + </configuration> + <executions> + <execution> + <id>make-assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + +</project> http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/assembly/hadoop-job.xml ---------------------------------------------------------------------- diff --git a/crunch-examples/src/main/assembly/hadoop-job.xml b/crunch-examples/src/main/assembly/hadoop-job.xml new file mode 100644 index 0000000..366bb33 --- /dev/null +++ b/crunch-examples/src/main/assembly/hadoop-job.xml @@ -0,0 +1,41 @@ +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. 
See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> +<assembly> + <id>job</id> + <formats> + <format>jar</format> + </formats> + <includeBaseDirectory>false</includeBaseDirectory> + <dependencySets> + <dependencySet> + <unpack>false</unpack> + <scope>runtime</scope> + <outputDirectory>lib</outputDirectory> + <excludes> + <exclude>${groupId}:${artifactId}</exclude> + </excludes> + </dependencySet> + <dependencySet> + <unpack>true</unpack> + <includes> + <include>${groupId}:${artifactId}</include> + </includes> + </dependencySet> + </dependencySets> +</assembly> http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java ---------------------------------------------------------------------- diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java b/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java new file mode 100644 index 0000000..868e38a --- /dev/null +++ b/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java @@ -0,0 +1,134 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.crunch.examples; + +import java.io.Serializable; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.crunch.CombineFn; +import org.apache.crunch.DoFn; +import org.apache.crunch.Emitter; +import org.apache.crunch.MapFn; +import org.apache.crunch.PCollection; +import org.apache.crunch.PTable; +import org.apache.crunch.Pair; +import org.apache.crunch.Pipeline; +import org.apache.crunch.impl.mr.MRPipeline; +import org.apache.crunch.types.writable.Writables; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +@SuppressWarnings("serial") +public class AverageBytesByIP extends Configured implements Tool, Serializable { + static enum COUNTERS { + NO_MATCH, + CORRUPT_SIZE + } + + static final String logRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+) \"([^\"]+)\" \"([^\"]+)\""; + + public int run(String[] args) throws Exception { + if (args.length != 2) { + System.err.println(); + System.err.println("Two and only two arguments are accepted."); + System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output"); + System.err.println(); + GenericOptionsParser.printGenericCommandUsage(System.err); + 
return 1; + } + // Create an object to coordinate pipeline creation and execution. + Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf()); + // Reference a given text file as a collection of Strings. + PCollection<String> lines = pipeline.readTextFile(args[0]); + + // Combiner used for summing up response size and count + CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner = CombineFn.pairAggregator(CombineFn.SUM_LONGS, + CombineFn.SUM_LONGS); + + // Table of (ip, sum(response size), count) + PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines + .parallelDo(extractResponseSize, + Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs()))).groupByKey() + .combineValues(stringPairOfLongsSumCombiner); + + // Calculate average response size by ip address + PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage, + Writables.tableOf(Writables.strings(), Writables.doubles())); + + // write the result to a text file + pipeline.writeTextFile(avgs, args[1]); + // Execute the pipeline as a MapReduce. + pipeline.done(); + return 0; + } + + // Function to calculate the average response size for a given ip address + // + // Input: (ip, sum(response size), count) + // Output: (ip, average response size) + MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>> calulateAverage = new MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>>() { + @Override + public Pair<String, Double> map(Pair<String, Pair<Long, Long>> arg) { + Pair<Long, Long> sumCount = arg.second(); + double avg = 0; + if (sumCount.second() > 0) { + avg = (double) sumCount.first() / (double) sumCount.second(); + } + return Pair.of(arg.first(), avg); + } + }; + + // Function to parse apache log records + // Given a standard apache log line, extract the ip address and + // response size. Outputs ip and the response size and a count (1) so that + // a combiner can be used. + // + // Input: 55.1.3.2 ...... 
200 512 .... + // Output: (55.1.3.2, (512, 1)) + DoFn<String, Pair<String, Pair<Long, Long>>> extractResponseSize = new DoFn<String, Pair<String, Pair<Long, Long>>>() { + transient Pattern pattern; + + public void initialize() { + pattern = Pattern.compile(logRegex); + } + + public void process(String line, Emitter<Pair<String, Pair<Long, Long>>> emitter) { + Matcher matcher = pattern.matcher(line); + if (matcher.matches()) { + try { + Long responseSize = Long.parseLong(matcher.group(7)); + Pair<Long, Long> sumCount = Pair.of(responseSize, 1L); + String remoteAddr = matcher.group(1); + emitter.emit(Pair.of(remoteAddr, sumCount)); + } catch (NumberFormatException e) { + this.getCounter(COUNTERS.CORRUPT_SIZE).increment(1); + } + } else { + this.getCounter(COUNTERS.NO_MATCH).increment(1); + } + } + }; + + public static void main(String[] args) throws Exception { + ToolRunner.run(new Configuration(), new AverageBytesByIP(), args); + } +} http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java ---------------------------------------------------------------------- diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java b/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java new file mode 100644 index 0000000..1953e3a --- /dev/null +++ b/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.crunch.examples; + +import java.io.Serializable; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.crunch.CombineFn; +import org.apache.crunch.DoFn; +import org.apache.crunch.Emitter; +import org.apache.crunch.PCollection; +import org.apache.crunch.PTable; +import org.apache.crunch.Pair; +import org.apache.crunch.Pipeline; +import org.apache.crunch.impl.mr.MRPipeline; +import org.apache.crunch.types.writable.Writables; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +@SuppressWarnings("serial") +public class TotalBytesByIP extends Configured implements Tool, Serializable { + static enum COUNTERS { + NO_MATCH, + CORRUPT_SIZE + } + + static final String logRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+) \"([^\"]+)\" \"([^\"]+)\""; + + public int run(String[] args) throws Exception { + if (args.length != 2) { + System.err.println(); + System.err.println("Two and only two arguments are accepted."); + System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output"); + System.err.println(); + GenericOptionsParser.printGenericCommandUsage(System.err); + return 1; + } + // Create an object to coordinate pipeline creation and execution. 
+ Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf()); + // Reference a given text file as a collection of Strings. + PCollection<String> lines = pipeline.readTextFile(args[0]); + + // Combiner used for summing up response size + CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS(); + + // Table of (ip, sum(response size)) + PTable<String, Long> ipAddrResponseSize = lines + .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs())).groupByKey() + .combineValues(longSumCombiner); + + pipeline.writeTextFile(ipAddrResponseSize, args[1]); + // Execute the pipeline as a MapReduce. + pipeline.done(); + return 0; + } + + // Function to parse apache log records + // Given a standard apache log line, extract the ip address and + // request size. Outputs the ip and response size. + // + // Input: 55.1.3.2 ...... 200 512 .... + // Output: (55.1.3.2, 512) + DoFn<String, Pair<String, Long>> extractIPResponseSize = new DoFn<String, Pair<String, Long>>() { + transient Pattern pattern; + + public void initialize() { + pattern = Pattern.compile(logRegex); + } + + public void process(String line, Emitter<Pair<String, Long>> emitter) { + Matcher matcher = pattern.matcher(line); + if (matcher.matches()) { + try { + Long requestSize = Long.parseLong(matcher.group(7)); + String remoteAddr = matcher.group(1); + emitter.emit(Pair.of(remoteAddr, requestSize)); + } catch (NumberFormatException e) { + // corrupt line, we should increment counter + } + } + } + }; + + public static void main(String[] args) throws Exception { + ToolRunner.run(new Configuration(), new TotalBytesByIP(), args); + } +} http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java ---------------------------------------------------------------------- diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java 
b/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java new file mode 100644 index 0000000..e4ce25b --- /dev/null +++ b/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.crunch.examples; + +import java.io.Serializable; + +import org.apache.crunch.DoFn; +import org.apache.crunch.Emitter; +import org.apache.crunch.PCollection; +import org.apache.crunch.PTable; +import org.apache.crunch.Pipeline; +import org.apache.crunch.impl.mr.MRPipeline; +import org.apache.crunch.types.writable.Writables; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +public class WordCount extends Configured implements Tool, Serializable { + public int run(String[] args) throws Exception { + if (args.length != 3) { + System.err.println(); + System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output"); + System.err.println(); + GenericOptionsParser.printGenericCommandUsage(System.err); + return 1; + } + // Create an object to coordinate pipeline creation and execution. + Pipeline pipeline = new MRPipeline(WordCount.class, getConf()); + // Reference a given text file as a collection of Strings. + PCollection<String> lines = pipeline.readTextFile(args[1]); + + // Define a function that splits each line in a PCollection of Strings into + // a + // PCollection made up of the individual words in the file. + PCollection<String> words = lines.parallelDo(new DoFn<String, String>() { + public void process(String line, Emitter<String> emitter) { + for (String word : line.split("\\s+")) { + emitter.emit(word); + } + } + }, Writables.strings()); // Indicates the serialization format + + // The count method applies a series of Crunch primitives and returns + // a map of the unique words in the input PCollection to their counts. + // Best of all, the count() function doesn't need to know anything about + // the kind of data stored in the input PCollection. 
+ PTable<String, Long> counts = words.count(); + + // Instruct the pipeline to write the resulting counts to a text file. + pipeline.writeTextFile(counts, args[2]); + // Execute the pipeline as a MapReduce. + pipeline.done(); + return 0; + } + + public static void main(String[] args) throws Exception { + ToolRunner.run(new Configuration(), new WordCount(), args); + } +} http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/site/markdown/index.md ---------------------------------------------------------------------- diff --git a/crunch-examples/src/site/markdown/index.md b/crunch-examples/src/site/markdown/index.md new file mode 100644 index 0000000..838e3ae --- /dev/null +++ b/crunch-examples/src/site/markdown/index.md @@ -0,0 +1,20 @@ +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> +# Apache Crunch - Examples +--- http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/site/site.xml ---------------------------------------------------------------------- diff --git a/crunch-examples/src/site/site.xml b/crunch-examples/src/site/site.xml new file mode 100644 index 0000000..73fbd17 --- /dev/null +++ b/crunch-examples/src/site/site.xml @@ -0,0 +1,34 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="${project.name}" + xmlns="http://maven.apache.org/DECORATION/1.3.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/DECORATION/1.3.0 + http://maven.apache.org/xsd/decoration-1.3.0.xsd"> + + <body> + <!-- Note: Breadcrumbs for Doxia's Markdown parser are currently broken, + see https://jira.codehaus.org/browse/DOXIA-472 --> + <breadcrumbs> + <item name="Apache" href="http://www.apache.org/index.html" /> + <item name="Crunch" href="../index.html"/> + </breadcrumbs> + + </body> + +</project> http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/build.sbt ---------------------------------------------------------------------- diff --git a/crunch-scrunch/build.sbt b/crunch-scrunch/build.sbt new file mode 100644 index 0000000..7d4a6f8 --- /dev/null +++ b/crunch-scrunch/build.sbt @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +name := "scrunch" + +version := "0.2.0" + +scalaVersion := "2.9.2" + +resolvers ++= Seq( + "Local Maven Repository" at "file://"+Path.userHome.absolutePath+"/.m2/repository", + "Hadoop Releases" at "https://repository.cloudera.com/content/repositories/releases/" +) + +libraryDependencies ++= Seq( + "org.apache.crunch" % "crunch" % "0.3.0" excludeAll( + ExclusionRule(organization = "com.sun.jdmk"), + ExclusionRule(organization = "com.sun.jmx"), + ExclusionRule(organization = "javax.jms") + ), + "org.apache.hadoop" % "hadoop-client" % "0.20.2-cdh3u4" % "provided" excludeAll( + ExclusionRule(organization = "com.sun.jdmk"), + ExclusionRule(organization = "com.sun.jmx"), + ExclusionRule(organization = "javax.jms") + ), + "org.apache.hbase" % "hbase" % "0.90.6-cdh3u4" % "provided" excludeAll( + ExclusionRule(organization = "org.apache.hadoop"), + ExclusionRule(organization = "commons-logging"), + ExclusionRule(organization = "com.google.guava"), + ExclusionRule(organization = "log4j"), + ExclusionRule(organization = "org.slf4j") + ), + "junit" % "junit" % "4.8.1" % "test", + "org.scalatest" % "scalatest_2.9.2" % "1.7.2" % "test" +) + +parallelExecution in Test := false http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/pom.xml ---------------------------------------------------------------------- diff --git a/crunch-scrunch/pom.xml b/crunch-scrunch/pom.xml new file mode 100644 index 0000000..0e3eac8 --- /dev/null +++ b/crunch-scrunch/pom.xml @@ -0,0 +1,199 @@ +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch-parent</artifactId> + <version>0.3.0-incubating-SNAPSHOT</version> + </parent> + + <artifactId>crunch-scrunch</artifactId> + <name>Crunch for Scala</name> + + <dependencies> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-library</artifactId> + </dependency> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-compiler</artifactId> + </dependency> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>jline</artifactId> + </dependency> + <dependency> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch</artifactId> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hbase</groupId> + <artifactId>hbase</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_${scala.version}</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch-test</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + 
<plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>jar-with-dependencies</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + <configuration> + <descriptorRefs> + <descriptorRef>jar-with-dependencies</descriptorRef> + </descriptorRefs> + </configuration> + </execution> + <execution> + <id>make-assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + <configuration> + <descriptors> + <descriptor>${basedir}/src/main/assembly/release.xml</descriptor> + </descriptors> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.scala-tools</groupId> + <artifactId>maven-scala-plugin</artifactId> + <executions> + <execution> + <goals> + <goal>compile</goal> + <goal>testCompile</goal> + </goals> + <configuration> + <args> + <arg>-deprecation</arg> + <arg>-dependencyfile</arg> + <arg>${project.build.directory}/.scala_dependencies</arg> + </args> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <configuration> + <useFile>false</useFile> + <disableXmlReport>true</disableXmlReport> + <includes> + <include>${project.build.testSourceDirectory}/**/*Test.*</include> + <include>${project.build.testSourceDirectory}/**/*Suite.*</include> + </includes> + </configuration> + </plugin> + <!-- We put slow-running tests into src/it and run them during the + integration-test phase using the failsafe plugin. This way + developers can run unit tests conveniently from the IDE or via + "mvn package" from the command line without triggering time + consuming integration tests. 
--> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + <executions> + <execution> + <id>add-test-source</id> + <phase>validate</phase> + <goals> + <goal>add-test-source</goal> + </goals> + <configuration> + <sources> + <source>${basedir}/src/it/java</source> + <source>${basedir}/src/it/scala</source> + </sources> + </configuration> + </execution> + <execution> + <id>add-test-resource</id> + <phase>validate</phase> + <goals> + <goal>add-test-resource</goal> + </goals> + <configuration> + <resources> + <resource> + <directory>${basedir}/src/it/resources</directory> + </resource> + </resources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <configuration> + <testSourceDirectory>${basedir}/src/it/scala</testSourceDirectory> + <useFile>false</useFile> + <disableXmlReport>true</disableXmlReport> + <includes> + <include>**/*Test.*</include> + <include>**/*Suite.*</include> + </includes> + </configuration> + <executions> + <execution> + <goals> + <goal>integration-test</goal> + <goal>verify</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/src/it/resources/log4j.properties ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/resources/log4j.properties b/crunch-scrunch/src/it/resources/log4j.properties new file mode 100644 index 0000000..c7847f8 --- /dev/null +++ b/crunch-scrunch/src/it/resources/log4j.properties @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ***** Set the org.apache.crunch logger level to INFO and its only appender to A. +log4j.logger.org.apache.crunch=info, A + +# Log warnings on Hadoop for the local runner when testing +log4j.logger.org.apache.hadoop=warn, A + +# ***** A is set to be a ConsoleAppender. +log4j.appender.A=org.apache.log4j.ConsoleAppender +# ***** A uses PatternLayout. +log4j.appender.A.layout=org.apache.log4j.PatternLayout +log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n +
