[MediaWiki-commits] [Gerrit] Adding refinery-tools and pom.xml content - change (analytics...source)

QChris (Code Review) Thu, 22 May 2014 06:52:07 -0700

QChris has submitted this change and it was merged.

Change subject: Adding refinery-tools and pom.xml content
......................................................................



Adding refinery-tools and pom.xml content

Change-Id: Ic52969d79eb302cb80f50122703d576a93e783e9
---
M pom.xml
A refinery-tools/README.txt
M refinery-tools/pom.xml
A refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Dump.java
A refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Info.java
A refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Store.java
6 files changed, 498 insertions(+), 0 deletions(-)

Approvals:
  QChris: Verified; Looks good to me, approved



diff --git a/pom.xml b/pom.xml
index e69de29..1117b7d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -0,0 +1,212 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Parent and aggregator POM for the Wikimedia Analytics Refinery modules. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <name>Wikimedia Analytics Refinery</name>
+  <description>Wikimedia Analytics utilities for processing data.</description>
+  <url>https://github.com/wikimedia/analytics-refinery-source</url>
+
+  <groupId>org.wikimedia.analytics.refinery</groupId>
+  <artifactId>refinery</artifactId>
+  <version>0.0.1-SNAPSHOT</version>
+  <packaging>pom</packaging>
+
+
+  <!-- Child modules built together with this POM. -->
+  <modules>
+    <module>refinery-tools</module>
+  </modules>
+
+  <!-- NOTE(review): the user part of these addresses appears truncated
+       ("g...@") in this copy of the patch; restore it before relying on
+       these URLs. -->
+  <scm>
+    <url>scm:git:g...@github.com/wikimedia/analytics-refinery-source.git</url>
+    <connection>scm:git:g...@github.com/wikimedia/analytics-refinery-source.git</connection>
+    <developerConnection>scm:git:g...@github.com/wikimedia/analytics-refinery-source.git</developerConnection>
+  </scm>
+
+
+  <organization>
+    <name>Wikimedia Foundation</name>
+    <url>http://www.wikimediafoundation.org</url>
+  </organization>
+
+  <licenses>
+    <license>
+      <name>Apache License 2.0</name>
+      <url>http://www.apache.org/licenses/LICENSE-2.0.html</url>
+    </license>
+  </licenses>
+
+  <mailingLists>
+    <mailingList>
+      <name>Analytics Mailinglist</name>
+      <subscribe>https://lists.wikimedia.org/mailman/listinfo/analytics</subscribe>
+      <unsubscribe>https://lists.wikimedia.org/mailman/listinfo/analytics</unsubscribe>
+      <post>analyt...@lists.wikimedia.org</post>
+      <archive>http://lists.wikimedia.org/pipermail/analytics/</archive>
+    </mailingList>
+  </mailingLists>
+
+  <!-- "central" is declared with both releases and snapshots disabled, which
+       leaves wmf-mirrored as the only enabled repository in this list
+       (presumably so that all artifacts resolve through the mirror; confirm). -->
+  <repositories>
+    <repository>
+      <id>central</id>
+      <url>http://repo1.maven.org/maven2</url>
+      <releases>
+        <enabled>false</enabled>
+      </releases>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
+    <repository>
+      <id>wmf-mirrored</id>
+      <url>http://archiva.wikimedia.org/repository/mirrored/</url>
+    </repository>
+  </repositories>
+
+  <!-- Same arrangement as above, applied to plugin resolution. -->
+  <pluginRepositories>
+    <pluginRepository>
+      <id>central</id>
+      <url>http://repo1.maven.org/maven2</url>
+      <releases>
+        <enabled>false</enabled>
+      </releases>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </pluginRepository>
+    <pluginRepository>
+      <id>wmf-mirrored</id>
+      <url>http://archiva.wikimedia.org/repository/mirrored/</url>
+    </pluginRepository>
+  </pluginRepositories>
+
+  <!-- Deployment targets for release and snapshot artifacts. -->
+  <distributionManagement>
+    <repository>
+      <id>archiva.internal</id>
+      <name>Internal Release Repository</name>
+      <url>http://archiva.wikimedia.org/repository/internal/</url>
+    </repository>
+    <snapshotRepository>
+      <id>archiva.snapshots</id>
+      <name>Internal Snapshot Repository</name>
+      <url>http://archiva.wikimedia.org/repository/snapshots/</url>
+    </snapshotRepository>
+  </distributionManagement>
+
+  <!-- Pins the hadoop-common version so that modules can declare the
+       dependency without repeating a version. -->
+  <dependencyManagement>
+    <dependencies>
+      <dependency>
+        <groupId>org.apache.hadoop</groupId>
+        <artifactId>hadoop-common</artifactId>
+        <version>2.0.0-cdh4.3.1</version>
+      </dependency>
+    </dependencies>
+  </dependencyManagement>
+
+  <build>
+    <plugins>
+      <!-- Compile with Java 1.6 source and target levels. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>2.3.2</version>
+        <configuration>
+          <source>1.6</source>
+          <target>1.6</target>
+          <compilerArgument></compilerArgument>
+        </configuration>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-javadoc-plugin</artifactId>
+        <version>2.9</version>
+        <configuration>
+          <linksource>true</linksource>
+        </configuration>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-site-plugin</artifactId>
+        <version>3.0</version>
+        <dependencies>
+          <dependency>
+            <groupId>org.apache.maven.wagon</groupId>
+            <artifactId>wagon-ssh-external</artifactId>
+            <version>2.2</version>
+          </dependency>
+        </dependencies>
+
+        <configuration>
+          <reportPlugins>
+            <plugin>
+              <groupId>org.codehaus.mojo</groupId>
+              <artifactId>findbugs-maven-plugin</artifactId>
+              <version>2.5.2</version>
+              <!--
+                Optional FindBugs settings, currently disabled:
+                <threshold>Normal</threshold> High|Normal|Low|Exp|Ignore <effort>Default</effort>Min|Default|Max
+                <debug>true</debug>
+                <failOnError>false</failOnError>
+                -->
+            </plugin>
+          </reportPlugins>
+        </configuration>
+      </plugin>
+
+      <!-- The version range below admits every Maven version except exactly
+           2.1.0 and 2.2.0. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-enforcer-plugin</artifactId>
+        <version>1.0</version>
+        <executions>
+          <execution>
+            <id>enforce-maven</id>
+            <goals>
+              <goal>enforce</goal>
+            </goals>
+            <configuration>
+              <rules>
+                <requireMavenVersion>
+                  <version>(,2.1.0),(2.1.0,2.2.0),(2.2.0,)</version>
+                  <message>
+                    Maven 2.1.0 and 2.2.0 produce incorrect GPG signatures and checksums respectively.
+                  </message>
+                </requireMavenVersion>
+              </rules>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+
+    <pluginManagement>
+      <plugins>
+        <!-- NOTE(review): no profile with the id "sonatype-oss-release" is
+             defined in this POM; confirm it is supplied elsewhere or drop
+             the argument. -->
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-release-plugin</artifactId>
+          <version>2.1</version>
+          <configuration>
+            <mavenExecutorId>forked-path</mavenExecutorId>
+            <useReleaseProfile>false</useReleaseProfile>
+            <arguments>-Psonatype-oss-release</arguments>
+          </configuration>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
+
+  <reporting>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-javadoc-plugin</artifactId>
+        <version>2.9</version>
+        <configuration>
+          <linksource>true</linksource>
+        </configuration>
+      </plugin>
+    </plugins>
+  </reporting>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <!-- NOTE(review): skip.tests is not referenced anywhere in this POM. -->
+    <skip.tests>false</skip.tests>
+  </properties>
+
+</project>
diff --git a/refinery-tools/README.txt b/refinery-tools/README.txt
new file mode 100644
index 0000000..e373c2a
--- /dev/null
+++ b/refinery-tools/README.txt
@@ -0,0 +1,14 @@
+The refinery-tools jar contains the following tools:
+
+* org.wikimedia.analytics.refinery.tools.Dump
+
+  Dumps a sequence file to stdout.
+
+* org.wikimedia.analytics.refinery.tools.Info
+
+  Dumps info about a sequence file.
+
+* org.wikimedia.analytics.refinery.tools.Store
+
+  Stores data into a snappy block-compressed sequence file.
+
diff --git a/refinery-tools/pom.xml b/refinery-tools/pom.xml
index e69de29..8a5d646 100644
--- a/refinery-tools/pom.xml
+++ b/refinery-tools/pom.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Module POM for the refinery-tools jar. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.wikimedia.analytics.refinery</groupId>
+    <artifactId>refinery</artifactId>
+    <version>0.0.1-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>refinery-tools</artifactId>
+  <name>Wikimedia Analytics Refinery Tools</name>
+  <packaging>jar</packaging>
+
+  <dependencies>
+    <!-- No version is given here; it has to come from dependency management
+         (NOTE(review): presumably the parent POM; confirm). -->
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <!-- Runs the shade goal in the package phase. The shaded artifact is
+           not attached as a separate artifact, and no dependency-reduced POM
+           is written. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>2.0</version>
+        <configuration>
+          <shadedArtifactAttached>false</shadedArtifactAttached>
+        </configuration>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <createDependencyReducedPom>false</createDependencyReducedPom>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>
diff --git a/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Dump.java b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Dump.java
new file mode 100644
index 0000000..ff3b74f
--- /dev/null
+++ b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Dump.java
@@ -0,0 +1,66 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.tools;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Dumps a sequence file to stdout.
+ * <p>
+ * Each value of the sequence file is printed on a separate line.
+ */
+public class Dump {
+    /**
+     * Reads a sequence file and dumps its values to stdout.
+     * <p>
+     * Keys are read but not printed. A line break is appended to a value only
+     * if its string form does not already end with one.
+     *
+     * @param args the first item is used as name of the file to be read.
+     *     If the array does not hold exactly one item, a usage message is
+     *     printed to stderr and the JVM exits with status 1.
+     * @throws IOException if whatever IO problem occurs.
+     * @throws IllegalAccessException if instantiating Writables fails.
+     * @throws InstantiationException if instantiating Writables fails.
+     */
+    public static void main(String[] args) throws IOException,
+            InstantiationException, IllegalAccessException {
+        if (args == null || args.length != 1) {
+            System.err.println("Usage: <file>\n" + "\n"
+                    + "<file> - Read this file as a Hadoop SequenceFile and "
+                    + "output text to stdout.");
+            System.exit(1);
+        }
+
+        Path path = new Path(args[0]);
+
+        Configuration conf = new Configuration();
+        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
+                SequenceFile.Reader.file(path));
+        try {
+            // The key and value holders are created once from the classes
+            // recorded in the file and refilled by every call to next().
+            Writable key = (Writable) reader.getKeyClass().newInstance();
+            Writable value = (Writable) reader.getValueClass().newInstance();
+
+            while (reader.next(key, value)) {
+                String valueStr = value.toString();
+                // Avoid emitting an empty extra line for values that already
+                // carry their own trailing newline.
+                if (valueStr.endsWith("\n")) {
+                    System.out.print(valueStr);
+                } else {
+                    System.out.println(valueStr);
+                }
+            }
+        } finally {
+            reader.close();
+        }
+    }
+}
diff --git a/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Info.java b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Info.java
new file mode 100644
index 0000000..b7d8d61
--- /dev/null
+++ b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Info.java
@@ -0,0 +1,81 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.tools;
+
+import java.io.IOException;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Dumps information about a sequence file to stdout.
+ */
+public class Info {
+
+    /**
+     * Prints a single row of the form "key: value" to stdout.
+     *
+     * @param key label printed in front of the colon.
+     * @param value text printed after the colon.
+     */
+    private static void renderKeyValue(String key, String value) {
+        System.out.println(key + ": " + value);
+    }
+
+    /**
+     * Dumps information about a sequence file to stdout.
+     * <p>
+     * Printed are the compression type, the compression codec, the key and
+     * value class names, and all metadata entries of the file.
+     *
+     * @param args the first item is the filename of the file to get
+     *     information about. If the array does not hold exactly one item, a
+     *     usage message is printed to stderr and the JVM exits with status 1.
+     * @throws IOException if file cannot be opened, read, ...
+     */
+    public static void main(String[] args) throws IOException {
+        if (args == null || args.length != 1) {
+            System.err.println("Usage: <file>\n"
+                    + "\n"
+                    + "<file> - file to get info about.");
+            System.exit(1);
+        }
+        Path path = new Path(args[0]);
+        Configuration conf = new Configuration();
+        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
+                SequenceFile.Reader.file(path));
+
+        // Close the reader even if one of the lookups below throws.
+        try {
+            String compressionType = reader.getCompressionType().toString();
+            renderKeyValue("CompressionType", compressionType);
+
+            // getCompressionCodec() may return null (no codec is set up for
+            // uncompressed files), so guard before dereferencing it.
+            org.apache.hadoop.io.compress.CompressionCodec codec =
+                    reader.getCompressionCodec();
+            if (codec == null) {
+                renderKeyValue("CompressionCodec", "(none)");
+            } else {
+                String codecName = codec.getClass().getName();
+                String codecExt = codec.getDefaultExtension();
+                renderKeyValue("CompressionCodec",
+                        codecName + " (" + codecExt + ")");
+            }
+
+            renderKeyValue("Key", reader.getKeyClassName());
+            renderKeyValue("Value", reader.getValueClassName());
+
+            Metadata metadata = reader.getMetadata();
+            int metadataSize = metadata.getMetadata().size();
+            renderKeyValue("Metadata",
+                    "(" + metadataSize + " metadata entries)");
+            for (Entry<Text, Text> entry : metadata.getMetadata().entrySet()) {
+                String key = entry.getKey().toString();
+                String value = entry.getValue().toString();
+                renderKeyValue(" * " + key, value);
+            }
+        } finally {
+            reader.close();
+        }
+    }
+}
diff --git a/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Store.java b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Store.java
new file mode 100644
index 0000000..44027a5
--- /dev/null
+++ b/refinery-tools/src/main/java/org/wikimedia/analytics/refinery/tools/Store.java
@@ -0,0 +1,80 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Writes stdin into a snappy block-compressed sequence file.
+ * <p>
+ * Input is consumed one line at a time; every line becomes one value. The
+ * keys are consecutive numbers, the first one being 0.
+ */
+public class Store {
+    /**
+     * Copies stdin line-by-line into a snappy block-compressed sequence file.
+     *
+     * @param args
+     *            exactly one item: the filename of the sequence file to
+     *            write. Otherwise a usage message goes to stderr and the JVM
+     *            exits with status 1.
+     * @throws IOException
+     *             if reading stdin or writing the sequence file fails
+     */
+    public static void main(String[] args) throws IOException {
+        boolean hasSingleArgument = args != null && args.length == 1;
+        if (!hasSingleArgument) {
+            System.err.println("Usage: <file>\n" + "\n"
+                    + "<file> - store stdin as SequenceFile into this file.");
+            System.exit(1);
+        }
+
+        Path destination = new Path(args[0]);
+        System.err.println("Reading from stdin, storing as " + destination);
+
+        InputStreamReader rawInput = new InputStreamReader(System.in);
+        BufferedReader stdin = new BufferedReader(rawInput);
+
+        Configuration configuration = new Configuration();
+        SequenceFile.Writer output = SequenceFile.createWriter(
+                configuration,
+                SequenceFile.Writer.file(destination),
+                SequenceFile.Writer.keyClass(LongWritable.class),
+                SequenceFile.Writer.valueClass(Text.class),
+                SequenceFile.Writer.compression(
+                        SequenceFile.CompressionType.BLOCK,
+                        new org.apache.hadoop.io.compress.SnappyCodec()));
+
+        try {
+            long nextKey = 0;
+
+            // readLine() strips the line terminator, so the stored values
+            // never include the trailing newline.
+            for (String current = stdin.readLine(); current != null;
+                    current = stdin.readLine()) {
+                output.append(new LongWritable(nextKey), new Text(current));
+                nextKey++;
+            }
+        } finally {
+            output.close();
+        }
+    }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/133520
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ic52969d79eb302cb80f50122703d576a93e783e9
Gerrit-PatchSet: 2
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: Milimetric <dandree...@wikimedia.org>
Gerrit-Reviewer: Nuria <nu...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: QChris <christ...@quelltextlich.at>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Adding refinery-tools and pom.xml content - change (analytics...source)

Reply via email to