QChris has submitted this change and it was merged. Change subject: Adding refinery-tools and pom.xml content ......................................................................
Adding refinery-tools and pom.xml content Change-Id: Ic52969d79eb302cb80f50122703d576a93e783e9 --- M pom.xml A refinery-tools/README.txt M refinery-tools/pom.xml A refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Dump.java A refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Info.java A refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Store.java 6 files changed, 498 insertions(+), 0 deletions(-) Approvals: QChris: Verified; Looks good to me, approved diff --git a/pom.xml b/pom.xml index e69de29..1117b7d 100644 --- a/pom.xml +++ b/pom.xml @@ -0,0 +1,212 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <name>Wikimedia Analytics Refinery</name> + <description>Wikimedia Analytics utilities for processing data.</description> + <url>https://github.com/wikimedia/analytics-refinery-source</url> + + <groupId>org.wikimedia.analytics.refinery</groupId> + <artifactId>refinery</artifactId> + <version>0.0.1-SNAPSHOT</version> + <packaging>pom</packaging> + + + <modules> + <module>refinery-tools</module> + </modules> + + <scm> + <url>scm:git:g...@github.com/wikimedia/analytics-refinery-source.git</url> + <connection>scm:git:g...@github.com/wikimedia/analytics-refinery-source.git</connection> + <developerConnection>scm:git:g...@github.com/wikimedia/analytics-refinery-source.git</developerConnection> + </scm> + + + <organization> + <name>Wikimedia Foundation</name> + <url>http://www.wikimediafoundation.org</url> + </organization> + + <licenses> + <license> + <name>Apache License 2.0</name> + <url>http://www.apache.org/licenses/LICENSE-2.0.html</url> + </license> + </licenses> + + <mailingLists> + <mailingList> + <name>Analytics Mailinglist</name> + <subscribe>https://lists.wikimedia.org/mailman/listinfo/analytics</subscribe> + 
<unsubscribe>https://lists.wikimedia.org/mailman/listinfo/analytics</unsubscribe> + <post>analyt...@lists.wikimedia.org</post> + <archive>http://lists.wikimedia.org/pipermail/analytics/</archive> + </mailingList> + </mailingLists> + + <repositories> + <repository> + <id>central</id> + <url>http://repo1.maven.org/maven2</url> + <releases> + <enabled>false</enabled> + </releases> + <snapshots> + <enabled>false</enabled> + </snapshots> + </repository> + <repository> + <id>wmf-mirrored</id> + <url>http://archiva.wikimedia.org/repository/mirrored/</url> + </repository> + </repositories> + + <pluginRepositories> + <pluginRepository> + <id>central</id> + <url>http://repo1.maven.org/maven2</url> + <releases> + <enabled>false</enabled> + </releases> + <snapshots> + <enabled>false</enabled> + </snapshots> + </pluginRepository> + <pluginRepository> + <id>wmf-mirrored</id> + <url>http://archiva.wikimedia.org/repository/mirrored/</url> + </pluginRepository> + </pluginRepositories> + + <distributionManagement> + <repository> + <id>archiva.internal</id> + <name>Internal Release Repository</name> + <url>http://archiva.wikimedia.org/repository/internal/</url> + </repository> + <snapshotRepository> + <id>archiva.snapshots</id> + <name>Internal Snapshot Repository</name> + <url>http://archiva.wikimedia.org/repository/snapshots/</url> + </snapshotRepository> + </distributionManagement> + + <dependencyManagement> + <dependencies> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>2.0.0-cdh4.3.1</version> + </dependency> + </dependencies> + </dependencyManagement> + + <build> + <plugins> + <plugin> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.3.2</version> + <configuration> + <source>1.6</source> + <target>1.6</target> + <compilerArgument></compilerArgument> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + 
<version>2.9</version> + <configuration> + <linksource>true</linksource> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-site-plugin</artifactId> + <version>3.0</version> + <dependencies> + <dependency> + <groupId>org.apache.maven.wagon</groupId> + <artifactId>wagon-ssh-external</artifactId> + <version>2.2</version> + </dependency> + </dependencies> + + <configuration> + <reportPlugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>findbugs-maven-plugin</artifactId> + <version>2.5.2</version> + <!-- + <threshold>Normal</threshold> High|Normal|Low|Exp|Ignore <effort>Default</effort>Min|Default|Max + <debug>true</debug> + <failOnError>false</failOnError> + --> + </plugin> + </reportPlugins> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-enforcer-plugin</artifactId> + <version>1.0</version> + <executions> + <execution> + <id>enforce-maven</id> + <goals> + <goal>enforce</goal> + </goals> + <configuration> + <rules> + <requireMavenVersion> + <version>(,2.1.0),(2.1.0,2.2.0),(2.2.0,)</version> + <message> + Maven 2.1.0 and 2.2.0 produce incorrect GPG signatures and checksums respectively. 
+ </message> + </requireMavenVersion> + </rules> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-release-plugin</artifactId> + <version>2.1</version> + <configuration> + <mavenExecutorId>forked-path</mavenExecutorId> + <useReleaseProfile>false</useReleaseProfile> + <arguments>-Psonatype-oss-release</arguments> + </configuration> + </plugin> + </plugins> + </pluginManagement> + </build> + + <reporting> + <plugins> + <plugin> + <artifactId>maven-javadoc-plugin</artifactId> + <version>2.9</version> + <configuration> + <linksource>true</linksource> + </configuration> + </plugin> + </plugins> + </reporting> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <skip.tests>false</skip.tests> + </properties> + + </project> diff --git a/refinery-tools/README.txt b/refinery-tools/README.txt new file mode 100644 index 0000000..e373c2a --- /dev/null +++ b/refinery-tools/README.txt @@ -0,0 +1,14 @@ +The refinery-tools jar contains the following tools: + +* org.wikimedia.analytics.refinery.tools.Dump + + Dumps a sequence file to stdout. + +* org.wikimedia.analytics.refinery.tools.Info + + Dumps info about a sequence file. + +* org.wikimedia.analytics.refinery.tools.Store + + Stores data into a snappy block-compressed sequence file. 
+ diff --git a/refinery-tools/pom.xml b/refinery-tools/pom.xml index e69de29..8a5d646 100644 --- a/refinery-tools/pom.xml +++ b/refinery-tools/pom.xml @@ -0,0 +1,45 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.wikimedia.analytics.refinery</groupId> + <artifactId>refinery</artifactId> + <version>0.0.1-SNAPSHOT</version> + </parent> + + <artifactId>refinery-tools</artifactId> + <name>Wikimedia Analytics Refinery Tools</name> + <packaging>jar</packaging> + + <dependencies> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.0</version> + <configuration> + <shadedArtifactAttached>false</shadedArtifactAttached> + </configuration> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <createDependencyReducedPom>false</createDependencyReducedPom> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> diff --git a/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Dump.java b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Dump.java new file mode 100644 index 0000000..ff3b74f --- /dev/null +++ b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Dump.java @@ -0,0 +1,66 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.tools;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Dumps a sequence file to stdout.
+ * <p>
+ * Each value of the sequence file is printed on a separate line.
+ */
+public class Dump {
+    /**
+     * Reads a sequence file and dumps its values to stdout.
+     * @param args the first item is used as name of the file to be read.
+     * @throws IOException if whatever IO problem occurs.
+     * @throws IllegalAccessException if instantiating Writables fails.
+     * @throws InstantiationException if instantiating Writables fails.
+     */
+    public static void main(String[] args) throws IOException,
+            InstantiationException, IllegalAccessException {
+        boolean singleArgument = args != null && args.length == 1;
+        if (!singleArgument) {
+            System.err.println("Usage: <file>\n" + "\n"
+                    + "<file> - Read this file as a Hadoop SequenceFile and output text to stdout.");
+            System.exit(1);
+        }
+
+        Path path = new Path(args[0]);
+        Configuration conf = new Configuration();
+        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
+                SequenceFile.Reader.file(path));
+        try {
+            Writable key = (Writable) reader.getKeyClass().newInstance();
+            Writable value = (Writable) reader.getValueClass().newInstance();
+            while (reader.next(key, value)) {
+                String text = value.toString();
+                // println() supplies the line break when the value has none.
+                if (!text.endsWith("\n")) {
+                    System.out.println(text);
+                } else {
+                    System.out.print(text);
+                }
+            }
+        } finally {
+            reader.close();
+        }
+    }
+}
diff --git a/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Info.java b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Info.java
new file mode 100644
index 0000000..b7d8d61
--- /dev/null
+++ b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Info.java
@@ -0,0 +1,81 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.tools;
+
+import java.io.IOException;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+
+/**
+ * Dumps information about a sequence file to stdout.
+ */
+public class Info {
+    /**
+     * Prints a single "key: value" row to stdout.
+     * @param key label printed before the colon
+     * @param value text printed after the colon
+     */
+    private static void renderKeyValue(String key, String value) {
+        System.out.println(key + ": " + value);
+    }
+
+    /**
+     * Prints compression type, codec, key/value class names and metadata
+     * entries of a sequence file to stdout.
+     *
+     * @param args the first item is the filename of the file to inspect.
+     * @throws IOException if file cannot be opened, read, ...
+     */
+    public static void main(String[] args) throws IOException {
+        if (args == null || args.length != 1) {
+            System.err.println("Usage: <file>\n"
+                    + "\n"
+                    + "<file> - file to get info about.");
+            System.exit(1);
+        }
+        Path path = new Path(args[0]);
+        Configuration conf = new Configuration();
+        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
+                SequenceFile.Reader.file(path));
+        try {
+            renderKeyValue("CompressionType",
+                    reader.getCompressionType().toString());
+
+            // Guard against a null codec (expected for uncompressed files).
+            CompressionCodec codec = reader.getCompressionCodec();
+            String codecInfo = (codec == null) ? "(none)"
+                    : codec.getClass().getName() + " (" + codec.getDefaultExtension() + ")";
+            renderKeyValue("CompressionCodec", codecInfo);
+
+            renderKeyValue("Key", reader.getKeyClassName());
+            renderKeyValue("Value", reader.getValueClassName());
+
+            Metadata metadata = reader.getMetadata();
+            int metadataSize = metadata.getMetadata().size();
+            renderKeyValue("Metadata", "(" + metadataSize + " metadata entries)");
+            for (Entry<Text, Text> entry : metadata.getMetadata().entrySet()) {
+                renderKeyValue(" * " + entry.getKey(), entry.getValue().toString());
+            }
+        } finally {
+            reader.close();
+        }
+    }
+}
diff --git a/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Store.java b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Store.java
new file mode 100644
index 0000000..44027a5
--- /dev/null
+++ b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Store.java
@@ -0,0 +1,80 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Stores data into a snappy block-compressed sequence file.
+ * <p>
+ * Data to store is read line-by-line from stdin. Each line is treated as a
+ * separate value. Keys are increasing, starting at 0.
+ */
+public class Store {
+    /**
+     * Stores data into a snappy block-compressed sequence file.
+     *
+     * @param args
+     *            the first item is used as filename for the destination
+     *            sequence file
+     * @throws IOException
+     *             if whatever IO problem occurs
+     */
+    public static void main(String[] args) throws IOException {
+        if (args == null || args.length != 1) {
+            System.err.println("Usage: <file>\n" + "\n"
+                    + "<file> - store stdin as SequenceFile into this file.");
+            System.exit(1);
+        }
+
+        Path path = new Path(args[0]);
+        System.err.println("Reading from stdin, storing as " + path);
+
+        BufferedReader reader = new BufferedReader(new InputStreamReader(
+                System.in));
+
+        Configuration conf = new Configuration();
+        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
+                SequenceFile.Writer.file(path),
+                SequenceFile.Writer.keyClass(LongWritable.class),
+                SequenceFile.Writer.valueClass(Text.class),
+                SequenceFile.Writer.compression(
+                        SequenceFile.CompressionType.BLOCK,
+                        new org.apache.hadoop.io.compress.SnappyCodec()));
+
+        try {
+            String line;
+            long key = 0;
+
+            // We loop over the lines and append them to the writer.
+            //
+            // Note that we do not treat the line's trailing newline as part of
+            // the line.
+            while ((line = reader.readLine()) != null) {
+                writer.append(new LongWritable(key++), new Text(line));
+            }
+        } finally {
+            writer.close();
+        }
+    }
+}
-- 
To view, visit https://gerrit.wikimedia.org/r/133520
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ic52969d79eb302cb80f50122703d576a93e783e9
Gerrit-PatchSet: 2
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: Milimetric <dandree...@wikimedia.org>
Gerrit-Reviewer: Nuria <nu...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: QChris <christ...@quelltextlich.at>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits