[SPARK] Avoid using -profile for Spark module and Make spark-2.0/2.1/2.2 and spark-common as the submodule of hivemall-spark
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/e2fb6f86 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/e2fb6f86 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/e2fb6f86 Branch: refs/heads/v0.5.0 Commit: e2fb6f8661ebbd34be7040ba3dd77d4522cd5a54 Parents: 0343700 Author: Makoto Yui <[email protected]> Authored: Mon Jan 22 19:59:52 2018 +0900 Committer: Makoto Yui <[email protected]> Committed: Mon Jan 22 19:59:52 2018 +0900 ---------------------------------------------------------------------- core/pom.xml | 41 --- mixserv/pom.xml | 29 -- nlp/pom.xml | 38 --- pom.xml | 166 ++++++--- spark/common/.cache-main | Bin 0 -> 12486 bytes spark/common/pom.xml | 65 ++++ ...isticRegressionDataGeneratorUDTFWrapper.java | 109 ++++++ .../java/hivemall/ftvec/AddBiasUDFWrapper.java | 83 +++++ .../ftvec/AddFeatureIndexUDFWrapper.java | 85 +++++ .../ftvec/ExtractFeatureUDFWrapper.java | 73 ++++ .../hivemall/ftvec/ExtractWeightUDFWrapper.java | 73 ++++ .../hivemall/ftvec/SortByFeatureUDFWrapper.java | 92 +++++ .../scaling/L2NormalizationUDFWrapper.java | 95 ++++++ .../hivemall/knn/lsh/MinHashesUDFWrapper.java | 93 ++++++ .../hivemall/tools/mapred/RowIdUDFWrapper.java | 72 ++++ .../main/scala/hivemall/HivemallException.scala | 25 ++ .../spark/ml/feature/HivemallLabeledPoint.scala | 82 +++++ spark/pom.xml | 159 +++++++++ spark/scalastyle-config.xml | 333 +++++++++++++++++++ spark/spark-2.0/pom.xml | 91 +---- .../spark/streaming/HivemallStreamingOps.scala | 47 +++ spark/spark-2.1/pom.xml | 89 +---- .../spark/streaming/HivemallStreamingOps.scala | 47 +++ spark/spark-2.2/pom.xml | 100 ++---- spark/spark-common/pom.xml | 146 -------- spark/spark-common/scalastyle-config.xml | 333 ------------------- ...isticRegressionDataGeneratorUDTFWrapper.java | 109 ------ .../java/hivemall/ftvec/AddBiasUDFWrapper.java | 83 ----- 
.../ftvec/AddFeatureIndexUDFWrapper.java | 85 ----- .../ftvec/ExtractFeatureUDFWrapper.java | 73 ---- .../hivemall/ftvec/ExtractWeightUDFWrapper.java | 73 ---- .../hivemall/ftvec/SortByFeatureUDFWrapper.java | 92 ----- .../scaling/L2NormalizationUDFWrapper.java | 95 ------ .../hivemall/knn/lsh/MinHashesUDFWrapper.java | 93 ------ .../hivemall/tools/mapred/RowIdUDFWrapper.java | 72 ---- .../main/scala/hivemall/HivemallException.scala | 25 -- .../spark/ml/feature/HivemallLabeledPoint.scala | 82 ----- .../spark/streaming/HivemallStreamingOps.scala | 47 --- xgboost/pom.xml | 34 -- 39 files changed, 1702 insertions(+), 1827 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index 05f6ec2..c87ded1 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -39,67 +39,41 @@ <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-common</artifactId> - <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-core</artifactId> - <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.hive</groupId> <artifactId>hive-exec</artifactId> - <version>${hive.version}</version> <scope>provided</scope> - <exclusions> - <exclusion> - <artifactId>jetty</artifactId> - <groupId>org.mortbay.jetty</groupId> - </exclusion> - <exclusion> - <groupId>javax.jdo</groupId> - <artifactId>jdo2-api</artifactId> - </exclusion> - <exclusion> - <groupId>asm-parent</groupId> - <artifactId>asm-parent</artifactId> - </exclusion> - <exclusion> - <groupId>asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - </exclusions> </dependency> <dependency> <groupId>commons-cli</groupId> <artifactId>commons-cli</artifactId> - 
<version>1.2</version> <scope>provided</scope> </dependency> <dependency> <groupId>commons-logging</groupId> <artifactId>commons-logging</artifactId> - <version>1.0.4</version> <scope>provided</scope> </dependency> <dependency> <groupId>log4j</groupId> <artifactId>log4j</artifactId> - <version>1.2.17</version> <scope>provided</scope> </dependency> <dependency> <groupId>javax.jdo</groupId> <artifactId>jdo2-api</artifactId> - <version>2.3-eb</version> <scope>provided</scope> </dependency> <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> - <version>${guava.version}</version> <scope>provided</scope> </dependency> @@ -159,25 +133,21 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>${junit.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>org.mockito</groupId> <artifactId>mockito-core</artifactId> - <version>1.10.19</version> <scope>test</scope> </dependency> <dependency> <groupId>org.powermock</groupId> <artifactId>powermock-module-junit4</artifactId> - <version>1.6.3</version> <scope>test</scope> </dependency> <dependency> <groupId>org.powermock</groupId> <artifactId>powermock-api-mockito</artifactId> - <version>1.6.3</version> <scope>test</scope> </dependency> </dependencies> @@ -188,21 +158,10 @@ <finalName>${project.artifactId}-${project.version}</finalName> <testOutputDirectory>target/test-classes</testOutputDirectory> <plugins> - <!-- hivemall-core-xx.jar --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <version>2.5</version> - <configuration> - <finalName>${project.artifactId}-${project.version}</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - </configuration> - </plugin> <!-- hivemall-core-xx-with-dependencies.jar including minimum dependencies --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> 
<executions> <execution> <id>jar-with-dependencies</id> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/mixserv/pom.xml ---------------------------------------------------------------------- diff --git a/mixserv/pom.xml b/mixserv/pom.xml index 13d9220..09b134f 100644 --- a/mixserv/pom.xml +++ b/mixserv/pom.xml @@ -39,49 +39,26 @@ <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-common</artifactId> - <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-core</artifactId> - <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.hive</groupId> <artifactId>hive-exec</artifactId> - <version>${hive.version}</version> <scope>provided</scope> - <exclusions> - <exclusion> - <artifactId>jetty</artifactId> - <groupId>org.mortbay.jetty</groupId> - </exclusion> - <exclusion> - <groupId>javax.jdo</groupId> - <artifactId>jdo2-api</artifactId> - </exclusion> - <exclusion> - <groupId>asm-parent</groupId> - <artifactId>asm-parent</artifactId> - </exclusion> - <exclusion> - <groupId>asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - </exclusions> </dependency> <dependency> <groupId>javax.jdo</groupId> <artifactId>jdo2-api</artifactId> - <version>2.3-eb</version> <scope>provided</scope> </dependency> <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> - <version>${guava.version}</version> <scope>provided</scope> </dependency> @@ -102,19 +79,16 @@ <dependency> <groupId>commons-cli</groupId> <artifactId>commons-cli</artifactId> - <version>1.2</version> <scope>compile</scope> </dependency> <dependency> <groupId>commons-logging</groupId> <artifactId>commons-logging</artifactId> - <version>1.0.4</version> <scope>compile</scope> </dependency> <dependency> <groupId>log4j</groupId> <artifactId>log4j</artifactId> - <version>1.2.17</version> 
<scope>compile</scope> </dependency> <dependency> @@ -129,13 +103,11 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>${junit.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>org.mockito</groupId> <artifactId>mockito-all</artifactId> - <version>1.10.19</version> <scope>test</scope> </dependency> </dependencies> @@ -150,7 +122,6 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> <executions> <execution> <id>jar-with-dependencies</id> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/nlp/pom.xml ---------------------------------------------------------------------- diff --git a/nlp/pom.xml b/nlp/pom.xml index c667c4e..429de86 100644 --- a/nlp/pom.xml +++ b/nlp/pom.xml @@ -39,67 +39,41 @@ <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-common</artifactId> - <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-core</artifactId> - <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.hive</groupId> <artifactId>hive-exec</artifactId> - <version>${hive.version}</version> <scope>provided</scope> - <exclusions> - <exclusion> - <artifactId>jetty</artifactId> - <groupId>org.mortbay.jetty</groupId> - </exclusion> - <exclusion> - <groupId>javax.jdo</groupId> - <artifactId>jdo2-api</artifactId> - </exclusion> - <exclusion> - <groupId>asm-parent</groupId> - <artifactId>asm-parent</artifactId> - </exclusion> - <exclusion> - <groupId>asm</groupId> - <artifactId>asm</artifactId> - </exclusion> - </exclusions> </dependency> <dependency> <groupId>commons-cli</groupId> <artifactId>commons-cli</artifactId> - <version>1.2</version> <scope>provided</scope> </dependency> <dependency> <groupId>commons-logging</groupId> 
<artifactId>commons-logging</artifactId> - <version>1.0.4</version> <scope>provided</scope> </dependency> <dependency> <groupId>log4j</groupId> <artifactId>log4j</artifactId> - <version>1.2.17</version> <scope>provided</scope> </dependency> <dependency> <groupId>javax.jdo</groupId> <artifactId>jdo2-api</artifactId> - <version>2.3-eb</version> <scope>provided</scope> </dependency> <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> - <version>${guava.version}</version> <scope>provided</scope> </dependency> <dependency> @@ -127,7 +101,6 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>${junit.version}</version> <scope>test</scope> </dependency> <dependency> @@ -145,21 +118,10 @@ <finalName>${project.artifactId}-${project.version}</finalName> <testOutputDirectory>target/test-classes</testOutputDirectory> <plugins> - <!-- hivemall-nlp-xx.jar --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <version>2.5</version> - <configuration> - <finalName>${project.artifactId}-${project.version}</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - </configuration> - </plugin> <!-- hivemall-nlp-xx-with-dependencies.jar including minimum dependencies --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> <executions> <execution> <id>jar-with-dependencies</id> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 444e38a..cc3bdce 100644 --- a/pom.xml +++ b/pom.xml @@ -249,6 +249,7 @@ <module>nlp</module> <module>xgboost</module> <module>mixserv</module> + <module>spark</module> </modules> <properties> @@ -256,8 +257,6 @@ <java.target.version>1.7</java.target.version> <maven.compiler.source>1.7</maven.compiler.source> 
<maven.compiler.target>1.7</maven.compiler.target> - <scala.version>2.11.8</scala.version> - <scala.binary.version>2.11</scala.binary.version> <maven.build.timestamp.format>yyyy</maven.build.timestamp.format> <build.year>${maven.build.timestamp}</build.year> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> @@ -315,50 +314,6 @@ <profiles> <profile> - <id>spark-2.2</id> - <modules> - <module>spark/spark-2.2</module> - <module>spark/spark-common</module> - </modules> - <properties> - <spark.version>2.2.0</spark.version> - <spark.binary.version>2.2</spark.binary.version> - </properties> - <build> - <plugins> - <!-- Spark-2.2 only supports Java 8 --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-enforcer-plugin</artifactId> - <version>${maven-enforcer-plugin.version}</version> - <executions> - <execution> - <id>enforce-versions</id> - <phase>validate</phase> - <goals> - <goal>enforce</goal> - </goals> - <configuration> - <rules> - <requireProperty> - <property>java.source.version</property> - <regex>1.8</regex> - <regexMessage>When -Pspark-2.2 set, java.source.version must be 1.8</regexMessage> - </requireProperty> - <requireProperty> - <property>java.target.version</property> - <regex>1.8</regex> - <regexMessage>When -Pspark-2.2 set, java.target.version must be 1.8</regexMessage> - </requireProperty> - </rules> - </configuration> - </execution> - </executions> - </plugin> - </plugins> - </build> - </profile> - <profile> <id>spark-2.1</id> <modules> <module>spark/spark-2.1</module> @@ -409,6 +364,110 @@ </profile> </profiles> + <dependencyManagement> + <dependencies> + <!-- provided scope --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>${hadoop.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-core</artifactId> + <version>${hadoop.version}</version> + 
<scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-exec</artifactId> + <version>${hive.version}</version> + <scope>provided</scope> + <exclusions> + <exclusion> + <artifactId>jetty</artifactId> + <groupId>org.mortbay.jetty</groupId> + </exclusion> + <exclusion> + <groupId>javax.jdo</groupId> + <artifactId>jdo2-api</artifactId> + </exclusion> + <exclusion> + <groupId>asm-parent</groupId> + <artifactId>asm-parent</artifactId> + </exclusion> + <exclusion> + <groupId>asm</groupId> + <artifactId>asm</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>commons-cli</groupId> + <artifactId>commons-cli</artifactId> + <version>1.2</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>1.0.4</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + <version>1.2.17</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>javax.jdo</groupId> + <artifactId>jdo2-api</artifactId> + <version>2.3-eb</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>${guava.version}</version> + <scope>provided</scope> + </dependency> + + <!-- test scope --> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>${junit.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-all</artifactId> + <version>1.10.19</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-core</artifactId> + <version>1.10.19</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.powermock</groupId> + <artifactId>powermock-module-junit4</artifactId> + 
<version>1.6.3</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.powermock</groupId> + <artifactId>powermock-api-mockito</artifactId> + <version>1.6.3</version> + <scope>test</scope> + </dependency> + </dependencies> + </dependencyManagement> + <build> <directory>target</directory> <outputDirectory>target/classes</outputDirectory> @@ -418,6 +477,19 @@ <pluginManagement> <plugins> <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>3.0.2</version> + <configuration> + <finalName>${project.artifactId}-${project.version}</finalName> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>3.1.0</version> + </plugin> + <plugin> <!-- mvn formatter:format --> <groupId>net.revelc.code</groupId> <artifactId>formatter-maven-plugin</artifactId> @@ -677,7 +749,7 @@ <failOnWarning>false</failOnWarning> <sourceDirectory>${basedir}/src/main/scala</sourceDirectory> <testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory> - <configLocation>spark/spark-common/scalastyle-config.xml</configLocation> + <configLocation>spark/scalastyle-config.xml</configLocation> <outputFile>${basedir}/target/scalastyle-output.xml</outputFile> <inputEncoding>${project.build.sourceEncoding}</inputEncoding> <outputEncoding>${project.reporting.outputEncoding}</outputEncoding> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/.cache-main ---------------------------------------------------------------------- diff --git a/spark/common/.cache-main b/spark/common/.cache-main new file mode 100644 index 0000000..a1b634c Binary files /dev/null and b/spark/common/.cache-main differ http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/pom.xml ---------------------------------------------------------------------- diff --git a/spark/common/pom.xml b/spark/common/pom.xml 
new file mode 100644 index 0000000..164b89e --- /dev/null +++ b/spark/common/pom.xml @@ -0,0 +1,65 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-spark</artifactId> + <version>0.5.1-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <artifactId>hivemall-spark-common</artifactId> + <name>Hivemall on Spark Common</name> + <packaging>jar</packaging> + + <properties> + <main.basedir>${project.parent.parent.basedir}</main.basedir> + </properties> + + <dependencies> + <!-- provided scope --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-core</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-exec</artifactId> + <scope>provided</scope> 
+ </dependency> + + <!-- compile scope --> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-core</artifactId> + <scope>compile</scope> + </dependency> + </dependencies> + +</project> + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/dataset/LogisticRegressionDataGeneratorUDTFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/dataset/LogisticRegressionDataGeneratorUDTFWrapper.java b/spark/common/src/main/java/hivemall/dataset/LogisticRegressionDataGeneratorUDTFWrapper.java new file mode 100644 index 0000000..cf10ed7 --- /dev/null +++ b/spark/common/src/main/java/hivemall/dataset/LogisticRegressionDataGeneratorUDTFWrapper.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.dataset; + +import hivemall.UDTFWithOptions; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.util.Random; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Options; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.Collector; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; + +/** + * A wrapper of [[hivemall.dataset.LogisticRegressionDataGeneratorUDTF]]. This wrapper is needed + * because Spark cannot handle HadoopUtils#getTaskId() correctly. + */ +@Description(name = "lr_datagen", + value = "_FUNC_(options string) - Generates a logistic regression dataset") +public final class LogisticRegressionDataGeneratorUDTFWrapper extends UDTFWithOptions { + private transient LogisticRegressionDataGeneratorUDTF udtf = + new LogisticRegressionDataGeneratorUDTF(); + + @Override + protected Options getOptions() { + Options options = null; + try { + Method m = udtf.getClass().getDeclaredMethod("getOptions"); + m.setAccessible(true); + options = (Options) m.invoke(udtf); + } catch (Exception e) { + e.printStackTrace(); + } + return options; + } + + @SuppressWarnings("all") + @Override + protected CommandLine processOptions(ObjectInspector[] objectInspectors) + throws UDFArgumentException { + CommandLine commands = null; + try { + Method m = udtf.getClass().getDeclaredMethod("processOptions"); + m.setAccessible(true); + commands = (CommandLine) m.invoke(udtf, objectInspectors); + } catch (Exception e) { + e.printStackTrace(); + } + return commands; + } + + @Override + public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { + try { + // 
Extract a collector for LogisticRegressionDataGeneratorUDTF + Field collector = GenericUDTF.class.getDeclaredField("collector"); + collector.setAccessible(true); + udtf.setCollector((Collector) collector.get(this)); + + // To avoid HadoopUtils#getTaskId() + Class<?> clazz = udtf.getClass(); + Field rnd1 = clazz.getDeclaredField("rnd1"); + Field rnd2 = clazz.getDeclaredField("rnd2"); + Field r_seed = clazz.getDeclaredField("r_seed"); + r_seed.setAccessible(true); + final long seed = r_seed.getLong(udtf) + (int) Thread.currentThread().getId(); + rnd1.setAccessible(true); + rnd2.setAccessible(true); + rnd1.set(udtf, new Random(seed)); + rnd2.set(udtf, new Random(seed + 1)); + } catch (Exception e) { + e.printStackTrace(); + } + return udtf.initialize(argOIs); + } + + @Override + public void process(Object[] objects) throws HiveException { + udtf.process(objects); + } + + @Override + public void close() throws HiveException { + udtf.close(); + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/ftvec/AddBiasUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/ftvec/AddBiasUDFWrapper.java b/spark/common/src/main/java/hivemall/ftvec/AddBiasUDFWrapper.java new file mode 100644 index 0000000..b454fd9 --- /dev/null +++ b/spark/common/src/main/java/hivemall/ftvec/AddBiasUDFWrapper.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.ftvec; + +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; + +/** + * A wrapper of [[hivemall.ftvec.AddBiasUDF]]. + * + * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle List<> + * as a return type in Hive UDF. Therefore, the type must be passed via ObjectInspector. 
+ */ +@Description(name = "add_bias", + value = "_FUNC_(features in array<string>) - Returns features with a bias as array<string>") +@UDFType(deterministic = true, stateful = false) +public class AddBiasUDFWrapper extends GenericUDF { + private AddBiasUDF udf = new AddBiasUDF(); + private ListObjectInspector argumentOI = null; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentLengthException( + "add_bias() has an single arguments: array<string> features"); + } + + switch (arguments[0].getCategory()) { + case LIST: + argumentOI = (ListObjectInspector) arguments[0]; + ObjectInspector elmOI = argumentOI.getListElementObjectInspector(); + if (elmOI.getCategory().equals(Category.PRIMITIVE)) { + if (((PrimitiveObjectInspector) elmOI).getPrimitiveCategory() == PrimitiveCategory.STRING) { + break; + } + } + default: + throw new UDFArgumentTypeException(0, "Type mismatch: features"); + } + + return ObjectInspectorFactory.getStandardListObjectInspector(argumentOI.getListElementObjectInspector()); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 1); + @SuppressWarnings("unchecked") + final List<String> input = (List<String>) argumentOI.getList(arguments[0].get()); + return udf.evaluate(input); + } + + @Override + public String getDisplayString(String[] children) { + return "add_bias(" + Arrays.toString(children) + ")"; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/ftvec/AddFeatureIndexUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/ftvec/AddFeatureIndexUDFWrapper.java b/spark/common/src/main/java/hivemall/ftvec/AddFeatureIndexUDFWrapper.java new file mode 100644 index 0000000..0b687db --- /dev/null +++ 
b/spark/common/src/main/java/hivemall/ftvec/AddFeatureIndexUDFWrapper.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.ftvec; + +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +/** + * A wrapper of [[hivemall.ftvec.AddFeatureIndexUDF]]. + * + * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle List<> + * as a return type in Hive UDF. Therefore, the type must be passed via ObjectInspector. 
+ */ +@Description( + name = "add_feature_index", + value = "_FUNC_(dense features in array<double>) - Returns a feature vector with feature indices") +@UDFType(deterministic = true, stateful = false) +public class AddFeatureIndexUDFWrapper extends GenericUDF { + private AddFeatureIndexUDF udf = new AddFeatureIndexUDF(); + private ListObjectInspector argumentOI = null; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentLengthException( + "add_feature_index() has an single arguments: array<double> features"); + } + + switch (arguments[0].getCategory()) { + case LIST: + argumentOI = (ListObjectInspector) arguments[0]; + ObjectInspector elmOI = argumentOI.getListElementObjectInspector(); + if (elmOI.getCategory().equals(Category.PRIMITIVE)) { + if (((PrimitiveObjectInspector) elmOI).getPrimitiveCategory() == PrimitiveCategory.DOUBLE) { + break; + } + } + default: + throw new UDFArgumentTypeException(0, "Type mismatch: features"); + } + + return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 1); + @SuppressWarnings("unchecked") + final List<Double> input = (List<Double>) argumentOI.getList(arguments[0].get()); + return udf.evaluate(input); + } + + @Override + public String getDisplayString(String[] children) { + return "add_feature_index(" + Arrays.toString(children) + ")"; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/ftvec/ExtractFeatureUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/ftvec/ExtractFeatureUDFWrapper.java b/spark/common/src/main/java/hivemall/ftvec/ExtractFeatureUDFWrapper.java new file mode 100644 
index 0000000..5924468 --- /dev/null +++ b/spark/common/src/main/java/hivemall/ftvec/ExtractFeatureUDFWrapper.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.ftvec; + +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +/** + * A wrapper of [[hivemall.ftvec.ExtractFeatureUDF]]. + * + * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle List<> + * as a return type in Hive UDF. Therefore, the type must be passed via ObjectInspector. 
+ */ +@Description(name = "extract_feature", + value = "_FUNC_(feature in string) - Returns a parsed feature as string") +@UDFType(deterministic = true, stateful = false) +public class ExtractFeatureUDFWrapper extends GenericUDF { + private ExtractFeatureUDF udf = new ExtractFeatureUDF(); + private PrimitiveObjectInspector argumentOI = null; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentLengthException( + "extract_feature() has an single arguments: string feature"); + } + + argumentOI = (PrimitiveObjectInspector) arguments[0]; + if (argumentOI.getPrimitiveCategory() != PrimitiveCategory.STRING) { + throw new UDFArgumentTypeException(0, "Type mismatch: feature"); + } + + return PrimitiveObjectInspectorFactory.javaStringObjectInspector; + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 1); + final String input = (String) argumentOI.getPrimitiveJavaObject(arguments[0].get()); + return udf.evaluate(input); + } + + @Override + public String getDisplayString(String[] children) { + return "extract_feature(" + Arrays.toString(children) + ")"; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/ftvec/ExtractWeightUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/ftvec/ExtractWeightUDFWrapper.java b/spark/common/src/main/java/hivemall/ftvec/ExtractWeightUDFWrapper.java new file mode 100644 index 0000000..8580247 --- /dev/null +++ b/spark/common/src/main/java/hivemall/ftvec/ExtractWeightUDFWrapper.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.ftvec; + +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +/** + * A wrapper of [[hivemall.ftvec.ExtractWeightUDF]]. + * + * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle List<> + * as a return type in Hive UDF. Therefore, the type must be passed via ObjectInspector. 
+ */ +@Description(name = "extract_weight", + value = "_FUNC_(feature in string) - Returns the weight of a feature as string") +@UDFType(deterministic = true, stateful = false) +public class ExtractWeightUDFWrapper extends GenericUDF { + private ExtractWeightUDF udf = new ExtractWeightUDF(); + private PrimitiveObjectInspector argumentOI = null; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentLengthException( + "extract_weight() has an single arguments: string feature"); + } + + argumentOI = (PrimitiveObjectInspector) arguments[0]; + if (argumentOI.getPrimitiveCategory() != PrimitiveCategory.STRING) { + throw new UDFArgumentTypeException(0, "Type mismatch: feature"); + } + + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.DOUBLE); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 1); + final String input = (String) argumentOI.getPrimitiveJavaObject(arguments[0].get()); + return udf.evaluate(input); + } + + @Override + public String getDisplayString(String[] children) { + return "extract_weight(" + Arrays.toString(children) + ")"; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/ftvec/SortByFeatureUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/ftvec/SortByFeatureUDFWrapper.java b/spark/common/src/main/java/hivemall/ftvec/SortByFeatureUDFWrapper.java new file mode 100644 index 0000000..584be6c --- /dev/null +++ b/spark/common/src/main/java/hivemall/ftvec/SortByFeatureUDFWrapper.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.ftvec; + +import java.util.Arrays; +import java.util.Map; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; + +/** + * A wrapper of [[hivemall.ftvec.SortByFeatureUDF]]. + * + * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle Map<> + * as a return type in Hive UDF. Therefore, the type must be passed via ObjectInspector. 
+ */ +@Description(name = "sort_by_feature", + value = "_FUNC_(map in map<int,float>) - Returns a sorted map") +@UDFType(deterministic = true, stateful = false) +public class SortByFeatureUDFWrapper extends GenericUDF { + private SortByFeatureUDF udf = new SortByFeatureUDF(); + private MapObjectInspector argumentOI = null; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentLengthException( + "sorted_by_feature() has an single arguments: map<int, float> map"); + } + + switch (arguments[0].getCategory()) { + case MAP: + argumentOI = (MapObjectInspector) arguments[0]; + ObjectInspector keyOI = argumentOI.getMapKeyObjectInspector(); + ObjectInspector valueOI = argumentOI.getMapValueObjectInspector(); + if (keyOI.getCategory().equals(Category.PRIMITIVE) + && valueOI.getCategory().equals(Category.PRIMITIVE)) { + final PrimitiveCategory keyCategory = ((PrimitiveObjectInspector) keyOI).getPrimitiveCategory(); + final PrimitiveCategory valueCategory = ((PrimitiveObjectInspector) valueOI).getPrimitiveCategory(); + if (keyCategory == PrimitiveCategory.INT + && valueCategory == PrimitiveCategory.FLOAT) { + break; + } + } + default: + throw new UDFArgumentTypeException(0, "Type mismatch: map"); + } + + + return ObjectInspectorFactory.getStandardMapObjectInspector( + argumentOI.getMapKeyObjectInspector(), argumentOI.getMapValueObjectInspector()); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 1); + @SuppressWarnings("unchecked") + final Map<IntWritable, FloatWritable> input = (Map<IntWritable, FloatWritable>) argumentOI.getMap(arguments[0].get()); + return udf.evaluate(input); + } + + @Override + public String getDisplayString(String[] children) { + return "sort_by_feature(" + Arrays.toString(children) + ")"; + } +} 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDFWrapper.java b/spark/common/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDFWrapper.java new file mode 100644 index 0000000..db533be --- /dev/null +++ b/spark/common/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDFWrapper.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.ftvec.scaling; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; + +/** + * A wrapper of [[hivemall.ftvec.scaling.L2NormalizationUDF]]. + * + * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark-1.3 cannot handle + * List<> as a return type in Hive UDF. The type must be passed via ObjectInspector. This issue has + * been reported in SPARK-6747, so a future release of Spark makes the wrapper obsolete. 
+ */ +public class L2NormalizationUDFWrapper extends GenericUDF { + private L2NormalizationUDF udf = new L2NormalizationUDF(); + + private transient List<Text> retValue = new ArrayList<Text>(); + private transient Converter toListText = null; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentLengthException("normalize() has an only single argument."); + } + + switch (arguments[0].getCategory()) { + case LIST: + ObjectInspector elmOI = ((ListObjectInspector) arguments[0]).getListElementObjectInspector(); + if (elmOI.getCategory().equals(Category.PRIMITIVE)) { + if (((PrimitiveObjectInspector) elmOI).getPrimitiveCategory() == PrimitiveCategory.STRING) { + break; + } + } + default: + throw new UDFArgumentTypeException(0, + "normalize() must have List[String] as an argument, but " + + arguments[0].getTypeName() + " was found."); + } + + // Create a ObjectInspector converter for arguments + ObjectInspector outputElemOI = ObjectInspectorFactory.getReflectionObjectInspector( + Text.class, ObjectInspectorOptions.JAVA); + ObjectInspector outputOI = ObjectInspectorFactory.getStandardListObjectInspector(outputElemOI); + toListText = ObjectInspectorConverters.getConverter(arguments[0], outputOI); + + ObjectInspector listElemOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector returnElemOI = ObjectInspectorUtils.getStandardObjectInspector(listElemOI); + return ObjectInspectorFactory.getStandardListObjectInspector(returnElemOI); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 1); + @SuppressWarnings("unchecked") + final List<Text> input = (List<Text>) toListText.convert(arguments[0].get()); + retValue = udf.evaluate(input); + return retValue; + } + + @Override + public String getDisplayString(String[] children) { + return "normalize(" + Arrays.toString(children) + 
")"; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/knn/lsh/MinHashesUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/knn/lsh/MinHashesUDFWrapper.java b/spark/common/src/main/java/hivemall/knn/lsh/MinHashesUDFWrapper.java new file mode 100644 index 0000000..d3bcbe6 --- /dev/null +++ b/spark/common/src/main/java/hivemall/knn/lsh/MinHashesUDFWrapper.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.knn.lsh; + +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; + +/** A wrapper of [[hivemall.knn.lsh.MinHashesUDF]]. */ +@Description( + name = "minhashes", + value = "_FUNC_(features in array<string>, noWeight in boolean) - Returns hashed features as array<int>") +@UDFType(deterministic = true, stateful = false) +public class MinHashesUDFWrapper extends GenericUDF { + private MinHashesUDF udf = new MinHashesUDF(); + private ListObjectInspector featuresOI = null; + private PrimitiveObjectInspector noWeightOI = null; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 2) { + throw new UDFArgumentLengthException( + "minhashes() has 2 arguments: array<string> features, boolean noWeight"); + } + + // Check argument types + switch (arguments[0].getCategory()) { + case LIST: + featuresOI = (ListObjectInspector) arguments[0]; + ObjectInspector elmOI = featuresOI.getListElementObjectInspector(); + if (elmOI.getCategory().equals(Category.PRIMITIVE)) { + if (((PrimitiveObjectInspector) elmOI).getPrimitiveCategory() == PrimitiveCategory.STRING) { + 
break; + } + } + default: + throw new UDFArgumentTypeException(0, "Type mismatch: features"); + } + + noWeightOI = (PrimitiveObjectInspector) arguments[1]; + if (noWeightOI.getPrimitiveCategory() != PrimitiveCategory.BOOLEAN) { + throw new UDFArgumentException("Type mismatch: noWeight"); + } + + return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.INT)); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 2); + @SuppressWarnings("unchecked") + final List<String> features = (List<String>) featuresOI.getList(arguments[0].get()); + final Boolean noWeight = PrimitiveObjectInspectorUtils.getBoolean(arguments[1].get(), + noWeightOI); + return udf.evaluate(features, noWeight); + } + + @Override + public String getDisplayString(String[] children) { + /** + * TODO: Need to return hive-specific type names. + */ + return "minhashes(" + Arrays.toString(children) + ")"; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/java/hivemall/tools/mapred/RowIdUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/common/src/main/java/hivemall/tools/mapred/RowIdUDFWrapper.java b/spark/common/src/main/java/hivemall/tools/mapred/RowIdUDFWrapper.java new file mode 100644 index 0000000..f386223 --- /dev/null +++ b/spark/common/src/main/java/hivemall/tools/mapred/RowIdUDFWrapper.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.tools.mapred; + +import java.util.UUID; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +/** An alternative implementation of [[hivemall.tools.mapred.RowIdUDF]]. */ +@Description( + name = "rowid", + value = "_FUNC_() - Returns a generated row id of a form {TASK_ID}-{UUID}-{SEQUENCE_NUMBER}") +@UDFType(deterministic = false, stateful = true) +public class RowIdUDFWrapper extends GenericUDF { + // RowIdUDF is directly used because spark cannot + // handle HadoopUtils#getTaskId(). 
+ + private long sequence; + private long taskId; + + public RowIdUDFWrapper() { + this.sequence = 0L; + this.taskId = Thread.currentThread().getId(); + } + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 0) { + throw new UDFArgumentLengthException("row_number() has no argument."); + } + + return PrimitiveObjectInspectorFactory.javaStringObjectInspector; + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert (arguments.length == 0); + sequence++; + /** + * TODO: Check if it is unique over all tasks in executors of Spark. + */ + return taskId + "-" + UUID.randomUUID() + "-" + sequence; + } + + @Override + public String getDisplayString(String[] children) { + return "row_number()"; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/scala/hivemall/HivemallException.scala ---------------------------------------------------------------------- diff --git a/spark/common/src/main/scala/hivemall/HivemallException.scala b/spark/common/src/main/scala/hivemall/HivemallException.scala new file mode 100644 index 0000000..53f6756 --- /dev/null +++ b/spark/common/src/main/scala/hivemall/HivemallException.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall + +class HivemallException(message: String, cause: Throwable) + extends Exception(message, cause) { + + def this(message: String) = this(message, null) +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/common/src/main/scala/org/apache/spark/ml/feature/HivemallLabeledPoint.scala ---------------------------------------------------------------------- diff --git a/spark/common/src/main/scala/org/apache/spark/ml/feature/HivemallLabeledPoint.scala b/spark/common/src/main/scala/org/apache/spark/ml/feature/HivemallLabeledPoint.scala new file mode 100644 index 0000000..3fb2d18 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/ml/feature/HivemallLabeledPoint.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.ml.feature + +import java.util.StringTokenizer + +import scala.collection.mutable.ListBuffer + +import hivemall.HivemallException + +// Used for DataFrame#explode +case class HivemallFeature(feature: String) + +/** + * Class that represents the features and labels of a data point for Hivemall. + * + * @param label Label for this data point. + * @param features List of features for this data point. + */ +case class HivemallLabeledPoint(label: Float = 0.0f, features: Seq[String]) { + override def toString: String = { + "%s,%s".format(label, features.mkString("[", ",", "]")) + } +} + +object HivemallLabeledPoint { + + // Simple parser for HivemallLabeledPoint + def parse(s: String): HivemallLabeledPoint = { + val (label, features) = s.indexOf(',') match { + case d if d > 0 => (s.substring(0, d), s.substring(d + 1)) + case _ => ("0.0", "[]") // Dummy + } + HivemallLabeledPoint(label.toFloat, parseTuple(new StringTokenizer(features, "[],", true))) + } + + // TODO: Support to parse rows without labels + private[this] def parseTuple(tokenizer: StringTokenizer): Seq[String] = { + val items = ListBuffer.empty[String] + var parsing = true + var allowDelim = false + while (parsing && tokenizer.hasMoreTokens()) { + val token = tokenizer.nextToken() + if (token == "[") { + items ++= parseTuple(tokenizer) + parsing = false + allowDelim = true + } else if (token == ",") { + if (allowDelim) { + allowDelim = false + } else { + throw new HivemallException("Found ',' at a wrong position.") + } + } else if (token == "]") { + parsing = false + } else { + items.append(token) + allowDelim = true + } + } + if (parsing) { + throw new HivemallException(s"A tuple must end with ']'.") + } + items + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/pom.xml ---------------------------------------------------------------------- diff --git a/spark/pom.xml b/spark/pom.xml new file mode 100644 index 0000000..7db85ab --- /dev/null +++ 
b/spark/pom.xml @@ -0,0 +1,159 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall</artifactId> + <version>0.5.1-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <artifactId>hivemall-spark</artifactId> + <packaging>pom</packaging> + <name>Hivemall on Apache Spark</name> + + <modules> + <module>common</module> + <module>spark-2.0</module> + <module>spark-2.1</module> + <module>spark-2.2</module> + </modules> + + <properties> + <main.basedir>${project.parent.basedir}</main.basedir> + <scala.version>2.11.8</scala.version> + <scala.binary.version>2.11</scala.binary.version> + </properties> + + <dependencyManagement> + <dependencies> + <!-- compile scope --> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-core</artifactId> + <version>${project.version}</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.apache.hivemall</groupId> + 
<artifactId>hivemall-xgboost</artifactId> + <version>${project.version}</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <version>1.8</version> + <scope>compile</scope> + </dependency> + + <!-- provided scope --> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-library</artifactId> + <version>${scala.version}</version> + <scope>provided</scope> + </dependency> + + <!-- test dependencies --> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-mixserv</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.xerial</groupId> + <artifactId>xerial-core</artifactId> + <version>3.2.3</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_${scala.binary.version}</artifactId> + <version>2.2.4</version> + <scope>test</scope> + </dependency> + </dependencies> + </dependencyManagement> + + <build> + <directory>target</directory> + <outputDirectory>target/classes</outputDirectory> + <finalName>${project.artifactId}-${project.version}</finalName> + <testOutputDirectory>target/test-classes</testOutputDirectory> + + <pluginManagement> + <plugins> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <version>3.2.2</version> + </plugin> + </plugins> + </pluginManagement> + + <plugins> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <executions> + <execution> + <id>scala-compile-first</id> + <phase>process-resources</phase> + <goals> + <goal>add-source</goal> + <goal>compile</goal> + </goals> + </execution> + <execution> + <id>scala-test-compile</id> + <phase>process-test-resources</phase> + <goals> + <goal>testCompile</goal> + </goals> + </execution> + </executions> + <!-- For incremental compilation --> + 
<configuration> + <scalaVersion>${scala.version}</scalaVersion> + <recompileMode>incremental</recompileMode> + <useZincServer>true</useZincServer> + <args> + <arg>-unchecked</arg> + <arg>-deprecation</arg> + <!-- TODO: To enable this option, we need to fix many warnings --> + <!-- <arg>-feature</arg> --> + </args> + <jvmArgs> + <jvmArg>-Xms512m</jvmArg> + <jvmArg>-Xmx1024m</jvmArg> + <jvmArg>-XX:PermSize=64m</jvmArg> + <jvmArg>-XX:MaxPermSize=512m</jvmArg> + <jvmArg>-XX:ReservedCodeCacheSize=512m</jvmArg> + </jvmArgs> + </configuration> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/scalastyle-config.xml ---------------------------------------------------------------------- diff --git a/spark/scalastyle-config.xml b/spark/scalastyle-config.xml new file mode 100644 index 0000000..13d1c47 --- /dev/null +++ b/spark/scalastyle-config.xml @@ -0,0 +1,333 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<!-- +If you wish to turn off checking for a section of code, you can put a comment in the source +before and after the section, with the following syntax: + + // scalastyle:off + ... 
// stuff that breaks the styles + // scalastyle:on + +You can also disable only one rule, by specifying its rule id, as specified in: + http://www.scalastyle.org/rules-0.7.0.html + + // scalastyle:off no.finalize + override def finalize(): Unit = ... + // scalastyle:on no.finalize + +This file is divided into 3 sections: + (1) rules that we enforce. + (2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet + (or we need to make the scalastyle rule more configurable). + (3) rules that we don't want to enforce. +--> + +<scalastyle> + <name>Scalastyle standard configuration</name> + + <!-- ================================================================================ --> + <!-- rules we enforce --> + <!-- ================================================================================ --> + + <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true"> + <parameters> + <parameter name="header"><![CDATA[/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */]]></parameter> + </parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true"> + <parameters> + <parameter name="maxLineLength"><![CDATA[100]]></parameter> + <parameter name="tabSize"><![CDATA[2]]></parameter> + <parameter name="ignoreImports">true</parameter> + </parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true"> + <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true"> + <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true"> + <parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true"> + <parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true"> + <parameters> + <parameter 
name="singleLineAllowed"><![CDATA[true]]></parameter> + <parameter name="doubleLineAllowed"><![CDATA[true]]></parameter> + </parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check> + + <check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true"> + <parameters> + <parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter> + </parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true"> + <parameters> + <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter> + </parameters> + </check> + + <!-- ??? usually shouldn't be checked into the code base. --> + <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check> + + <!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' --> + <check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> + <parameters><parameter name="regex">^println$</parameter></parameters> + <customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with + // scalastyle:off println + println(...) 
+ // scalastyle:on println]]></customMessage> + </check> + + <check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">@VisibleForTesting</parameter></parameters> + <customMessage><![CDATA[ + @VisibleForTesting causes classpath issues. Please note this in the java doc instead (SPARK-11615). + ]]></customMessage> + </check> + + <check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters> + <customMessage><![CDATA[ + Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use + ShutdownHookManager.addShutdownHook instead. + If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with + // scalastyle:off runtimeaddshutdownhook + Runtime.getRuntime.addShutdownHook(...) + // scalastyle:on runtimeaddshutdownhook + ]]></customMessage> + </check> + + <check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters> + <customMessage><![CDATA[ + Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use + java.util.concurrent.ConcurrentLinkedQueue instead. + If you must use mutable.SynchronizedBuffer, wrap the code block with + // scalastyle:off mutablesynchronizedbuffer + mutable.SynchronizedBuffer[...] + // scalastyle:on mutablesynchronizedbuffer + ]]></customMessage> + </check> + + <check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">Class\.forName</parameter></parameters> + <customMessage><![CDATA[ + Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead. 
+ If you must use Class.forName, wrap the code block with + // scalastyle:off classforname + Class.forName(...) + // scalastyle:on classforname + ]]></customMessage> + </check> + + <check customId="awaitresult" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">Await\.result</parameter></parameters> + <customMessage><![CDATA[ + Are you sure that you want to use Await.result? In most cases, you should use ThreadUtils.awaitResult instead. + If you must use Await.result, wrap the code block with + // scalastyle:off awaitresult + Await.result(...) + // scalastyle:on awaitresult + ]]></customMessage> + </check> + + <!-- As of SPARK-9613 JavaConversions should be replaced with JavaConverters --> + <check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> + <parameters><parameter name="regex">JavaConversions</parameter></parameters> + <customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import + scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage> + </check> + + <check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> + <parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters> + <customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead + of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage> + </check> + + <check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true"> + <parameters> + <parameter name="groups">java,scala,3rdParty,spark</parameter> + <parameter name="group.java">javax?\..*</parameter> + <parameter name="group.scala">scala\..*</parameter> + <parameter name="group.3rdParty">(?!org\.apache\.spark\.).*</parameter> + <parameter name="group.spark">org\.apache\.spark\..*</parameter> + </parameters> + </check> + + <check level="error" 
class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true"> + <parameters> + <parameter name="tokens">COMMA</parameter> + </parameters> + </check> + + <!-- SPARK-3854: Single Space between ')' and '{' --> + <check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">\)\{</parameter></parameters> + <customMessage><![CDATA[ + Single Space between ')' and `{`. + ]]></customMessage> + </check> + + <check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]</parameter></parameters> + <customMessage>Use Javadoc style indentation for multiline comments</customMessage> + </check> + + <check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters> + <customMessage>Omit braces in case clauses.</customMessage> + </check> + + <!-- SPARK-16877: Avoid Java annotations --> + <check customId="OverrideJavaCase" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> + <parameters><parameter name="regex">^Override$</parameter></parameters> + <customMessage>override modifier should be used instead of @java.lang.Override.</customMessage> + </check> + + <check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"></check> + + <!-- ================================================================================ --> + <!-- rules we'd like to enforce, but haven't cleaned up the codebase yet --> + <!-- ================================================================================ --> + + <!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. 
--> + <!-- Ideally the following two rules should be configurable to rule out string interpolation. --> + <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check> + <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check> + + <!-- This breaks symbolic method names so we don't turn it on. --> + <!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. --> + <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false"> + <parameters> + <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter> + </parameters> + </check> + + <!-- Should turn this on, but we have a few places that need to be fixed first --> + <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check> + + <!-- ================================================================================ --> + <!-- rules we don't want --> + <!-- ================================================================================ --> + + <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false"> + <parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters> + </check> + + <!-- We want the opposite of this: NewLineAtEofChecker --> + <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check> + + <!-- This one complains about all kinds of random things. Disable. 
--> + <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check> + + <!-- We use return quite a bit for control flows and guards --> + <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check> + + <!-- We use null a lot in low level code and to interface with 3rd party code --> + <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false"> + <parameters><parameter name="maxFileLength">800</parameter></parameters> + </check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false"> + <parameters><parameter name="maxTypes">30</parameter></parameters> + </check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false"> + <parameters><parameter name="maximum">10</parameter></parameters> + </check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false"> + <parameters><parameter name="maxLength">50</parameter></parameters> + </check> + + <!-- Not exactly feasible to enforce this right now. --> + <!-- It is also infrequent that somebody introduces a new class with a lot of methods. --> + <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false"> + <parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters> + </check> + + <!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... 
--> + <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false"> + <parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters> + </check> + +</scalastyle>
