http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-2.0/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/pom.xml b/spark/spark-2.0/pom.xml index fdaccd5..3d82f89 100644 --- a/spark/spark-2.0/pom.xml +++ b/spark/spark-2.0/pom.xml @@ -22,31 +22,31 @@ <parent> <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall</artifactId> + <artifactId>hivemall-spark</artifactId> <version>0.5.1-incubating-SNAPSHOT</version> - <relativePath>../../pom.xml</relativePath> + <relativePath>../pom.xml</relativePath> </parent> - <artifactId>hivemall-spark</artifactId> + <artifactId>hivemall-spark_2.0</artifactId> <name>Hivemall on Spark 2.0</name> <packaging>jar</packaging> <properties> - <main.basedir>${project.parent.basedir}</main.basedir> + <main.basedir>${project.parent.parent.basedir}</main.basedir> + <spark.version>2.0.2</spark.version> + <spark.binary.version>2.0</spark.binary.version> </properties> <dependencies> - <!-- hivemall dependencies --> + <!-- compile scope --> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-core</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-xgboost</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> @@ -55,22 +55,18 @@ <version>${project.version}</version> <scope>compile</scope> </dependency> - - <!-- third-party dependencies --> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-library</artifactId> - <version>${scala.version}</version> - <scope>compile</scope> - </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> - <version>1.8</version> <scope>compile</scope> </dependency> - <!-- other provided dependencies --> + <!-- provided scope --> + <dependency> + <groupId>org.scala-lang</groupId> + 
<artifactId>scala-library</artifactId> + <scope>provided</scope> + </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_${scala.binary.version}</artifactId> @@ -106,81 +102,26 @@ <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-mixserv</artifactId> - <version>${project.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>org.xerial</groupId> <artifactId>xerial-core</artifactId> - <version>3.2.3</version> <scope>test</scope> </dependency> <dependency> <groupId>org.scalatest</groupId> <artifactId>scalatest_${scala.binary.version}</artifactId> - <version>2.2.4</version> <scope>test</scope> </dependency> </dependencies> <build> - <directory>target</directory> - <outputDirectory>target/classes</outputDirectory> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <testOutputDirectory>target/test-classes</testOutputDirectory> <plugins> - <!-- For incremental compilation --> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <version>3.2.2</version> - <executions> - <execution> - <id>scala-compile-first</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile-first</id> - <phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - <configuration> - <scalaVersion>${scala.version}</scalaVersion> - <recompileMode>incremental</recompileMode> - <useZincServer>true</useZincServer> - <args> - <arg>-unchecked</arg> - <arg>-deprecation</arg> - <!-- TODO: To enable this option, we need to fix many wornings --> - <!-- <arg>-feature</arg> --> - </args> - <jvmArgs> - <jvmArg>-Xms512m</jvmArg> - <jvmArg>-Xmx1024m</jvmArg> - </jvmArgs> - </configuration> - </plugin> - <!-- hivemall-spark_xx-xx.jar --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - 
<artifactId>maven-jar-plugin</artifactId> - <version>2.5</version> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - </configuration> - </plugin> <!-- hivemall-spark_xx-xx-with-dependencies.jar including minimum dependencies --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> <executions> <execution> <id>jar-with-dependencies</id> @@ -189,8 +130,8 @@ <goal>shade</goal> </goals> <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}-with-dependencies</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> + <finalName>${project.artifactId}-${project.version}-with-dependencies</finalName> + <outputDirectory>${project.parent.parent.build.directory}</outputDirectory> <minimizeJar>false</minimizeJar> <createDependencyReducedPom>false</createDependencyReducedPom> <artifactSet> @@ -213,7 +154,6 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> - <version>2.7</version> <configuration> <skipTests>true</skipTests> </configuration> @@ -234,6 +174,7 @@ <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION> <SPARK_TESTING>1</SPARK_TESTING> <JAVA_HOME>${env.JAVA_HOME}</JAVA_HOME> + <PATH>${env.JAVA_HOME}/bin:${env.PATH}</PATH> </environmentVariables> <systemProperties> <log4j.configuration>file:src/test/resources/log4j.properties</log4j.configuration>
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-2.0/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala b/spark/spark-2.0/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala new file mode 100644 index 0000000..a6bbb4b --- /dev/null +++ b/spark/spark-2.0/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.streaming + +import scala.reflect.ClassTag + +import org.apache.spark.ml.feature.HivemallLabeledPoint +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.streaming.dstream.DStream + +final class HivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) { + + def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext) + : DStream[Row] = { + ds.transform[Row] { rdd: RDD[HivemallLabeledPoint] => + f(sqlContext.createDataFrame(rdd)).rdd + } + } +} + +object HivemallStreamingOps { + + /** + * Implicitly inject the [[HivemallStreamingOps]] into [[DStream]]. + */ + implicit def dataFrameToHivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) + : HivemallStreamingOps = { + new HivemallStreamingOps(ds) + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-2.1/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.1/pom.xml b/spark/spark-2.1/pom.xml index 0f127a7..e1da779 100644 --- a/spark/spark-2.1/pom.xml +++ b/spark/spark-2.1/pom.xml @@ -22,17 +22,19 @@ <parent> <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall</artifactId> + <artifactId>hivemall-spark</artifactId> <version>0.5.1-incubating-SNAPSHOT</version> - <relativePath>../../pom.xml</relativePath> + <relativePath>../pom.xml</relativePath> </parent> - <artifactId>hivemall-spark</artifactId> + <artifactId>hivemall-spark_2.1</artifactId> <name>Hivemall on Spark 2.1</name> <packaging>jar</packaging> <properties> - <main.basedir>${project.parent.basedir}</main.basedir> + <main.basedir>${project.parent.parent.basedir}</main.basedir> + <spark.version>2.1.1</spark.version> + <spark.binary.version>2.1</spark.binary.version> </properties> <dependencies> @@ -40,13 +42,11 @@ <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-core</artifactId> - <version>${project.version}</version> 
<scope>compile</scope> </dependency> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-xgboost</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> @@ -55,22 +55,18 @@ <version>${project.version}</version> <scope>compile</scope> </dependency> - - <!-- third-party dependencies --> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-library</artifactId> - <version>${scala.version}</version> - <scope>compile</scope> - </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> - <version>1.8</version> <scope>compile</scope> </dependency> - <!-- other provided dependencies --> + <!-- provided scope --> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-library</artifactId> + <scope>provided</scope> + </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_${scala.binary.version}</artifactId> @@ -106,81 +102,26 @@ <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-mixserv</artifactId> - <version>${project.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>org.xerial</groupId> <artifactId>xerial-core</artifactId> - <version>3.2.3</version> <scope>test</scope> </dependency> <dependency> <groupId>org.scalatest</groupId> <artifactId>scalatest_${scala.binary.version}</artifactId> - <version>2.2.4</version> <scope>test</scope> </dependency> </dependencies> <build> - <directory>target</directory> - <outputDirectory>target/classes</outputDirectory> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <testOutputDirectory>target/test-classes</testOutputDirectory> <plugins> - <!-- For incremental compilation --> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <version>3.2.2</version> - <executions> - <execution> - 
<id>scala-compile-first</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile-first</id> - <phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - <configuration> - <scalaVersion>${scala.version}</scalaVersion> - <recompileMode>incremental</recompileMode> - <useZincServer>true</useZincServer> - <args> - <arg>-unchecked</arg> - <arg>-deprecation</arg> - <!-- TODO: To enable this option, we need to fix many wornings --> - <!-- <arg>-feature</arg> --> - </args> - <jvmArgs> - <jvmArg>-Xms512m</jvmArg> - <jvmArg>-Xmx1024m</jvmArg> - </jvmArgs> - </configuration> - </plugin> - <!-- hivemall-spark_xx-xx.jar --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <version>2.5</version> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - </configuration> - </plugin> <!-- hivemall-spark_xx-xx-with-dependencies.jar including minimum dependencies --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> <executions> <execution> <id>jar-with-dependencies</id> @@ -189,8 +130,8 @@ <goal>shade</goal> </goals> <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}-with-dependencies</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> + <finalName>${project.artifactId}-${project.version}-with-dependencies</finalName> + <outputDirectory>${project.parent.parent.build.directory}</outputDirectory> <minimizeJar>false</minimizeJar> <createDependencyReducedPom>false</createDependencyReducedPom> <artifactSet> @@ -213,7 +154,6 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> 
<artifactId>maven-surefire-plugin</artifactId> - <version>2.7</version> <configuration> <skipTests>true</skipTests> </configuration> @@ -234,6 +174,7 @@ <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION> <SPARK_TESTING>1</SPARK_TESTING> <JAVA_HOME>${env.JAVA_HOME}</JAVA_HOME> + <PATH>${env.JAVA_HOME}/bin:${env.PATH}</PATH> </environmentVariables> <systemProperties> <log4j.configuration>file:src/test/resources/log4j.properties</log4j.configuration> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-2.1/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.1/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala b/spark/spark-2.1/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala new file mode 100644 index 0000000..a6bbb4b --- /dev/null +++ b/spark/spark-2.1/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.streaming + +import scala.reflect.ClassTag + +import org.apache.spark.ml.feature.HivemallLabeledPoint +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.streaming.dstream.DStream + +final class HivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) { + + def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext) + : DStream[Row] = { + ds.transform[Row] { rdd: RDD[HivemallLabeledPoint] => + f(sqlContext.createDataFrame(rdd)).rdd + } + } +} + +object HivemallStreamingOps { + + /** + * Implicitly inject the [[HivemallStreamingOps]] into [[DStream]]. + */ + implicit def dataFrameToHivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) + : HivemallStreamingOps = { + new HivemallStreamingOps(ds) + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-2.2/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.2/pom.xml b/spark/spark-2.2/pom.xml index 8a5ed00..1d9e3c8 100644 --- a/spark/spark-2.2/pom.xml +++ b/spark/spark-2.2/pom.xml @@ -22,34 +22,35 @@ <parent> <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall</artifactId> + <artifactId>hivemall-spark</artifactId> <version>0.5.1-incubating-SNAPSHOT</version> - <relativePath>../../pom.xml</relativePath> + <relativePath>../pom.xml</relativePath> </parent> - <artifactId>hivemall-spark</artifactId> + <artifactId>hivemall-spark_2.2</artifactId> <name>Hivemall on Spark 2.2</name> <packaging>jar</packaging> <properties> - <PermGen>64m</PermGen> - <MaxPermGen>512m</MaxPermGen> - <CodeCacheSize>512m</CodeCacheSize> - <main.basedir>${project.parent.basedir}</main.basedir> + <main.basedir>${project.parent.parent.basedir}</main.basedir> + <spark.version>2.2.0</spark.version> + <spark.binary.version>2.2</spark.binary.version> + <java.source.version>1.8</java.source.version> + 
<java.target.version>1.8</java.target.version> + <maven.compiler.source>1.8</maven.compiler.source> + <maven.compiler.target>1.8</maven.compiler.target> </properties> <dependencies> - <!-- hivemall dependencies --> + <!-- compile scope --> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-core</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-xgboost</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> @@ -58,22 +59,18 @@ <version>${project.version}</version> <scope>compile</scope> </dependency> - - <!-- third-party dependencies --> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-library</artifactId> - <version>${scala.version}</version> - <scope>compile</scope> - </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> - <version>1.8</version> <scope>compile</scope> </dependency> - <!-- other provided dependencies --> + <!-- provided scope --> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-library</artifactId> + <scope>provided</scope> + </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_${scala.binary.version}</artifactId> @@ -109,84 +106,26 @@ <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-mixserv</artifactId> - <version>${project.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>org.xerial</groupId> <artifactId>xerial-core</artifactId> - <version>3.2.3</version> <scope>test</scope> </dependency> <dependency> <groupId>org.scalatest</groupId> <artifactId>scalatest_${scala.binary.version}</artifactId> - <version>2.2.4</version> <scope>test</scope> </dependency> </dependencies> <build> - <directory>target</directory> - <outputDirectory>target/classes</outputDirectory> - 
<finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <testOutputDirectory>target/test-classes</testOutputDirectory> <plugins> - <!-- For incremental compilation --> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <version>3.2.2</version> - <executions> - <execution> - <id>scala-compile-first</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile-first</id> - <phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - <configuration> - <scalaVersion>${scala.version}</scalaVersion> - <recompileMode>incremental</recompileMode> - <useZincServer>true</useZincServer> - <args> - <arg>-unchecked</arg> - <arg>-deprecation</arg> - <!-- TODO: To enable this option, we need to fix many wornings --> - <!-- <arg>-feature</arg> --> - </args> - <jvmArgs> - <jvmArg>-Xms1024m</jvmArg> - <jvmArg>-Xmx1024m</jvmArg> - <jvmArg>-XX:PermSize=${PermGen}</jvmArg> - <jvmArg>-XX:MaxPermSize=${MaxPermGen}</jvmArg> - <jvmArg>-XX:ReservedCodeCacheSize=${CodeCacheSize}</jvmArg> - </jvmArgs> - </configuration> - </plugin> - <!-- hivemall-spark_xx-xx.jar --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <version>2.5</version> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - </configuration> - </plugin> <!-- hivemall-spark_xx-xx-with-dependencies.jar including minimum dependencies --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> <executions> <execution> <id>jar-with-dependencies</id> @@ -195,8 +134,8 @@ <goal>shade</goal> </goals> <configuration> - 
<finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}-with-dependencies</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> + <finalName>${project.artifactId}-${project.version}-with-dependencies</finalName> + <outputDirectory>${project.parent.parent.build.directory}</outputDirectory> <minimizeJar>false</minimizeJar> <createDependencyReducedPom>false</createDependencyReducedPom> <artifactSet> @@ -219,7 +158,6 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> - <version>2.7</version> <configuration> <skipTests>true</skipTests> </configuration> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-common/pom.xml b/spark/spark-common/pom.xml deleted file mode 100644 index 0665c11..0000000 --- a/spark/spark-common/pom.xml +++ /dev/null @@ -1,146 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall</artifactId> - <version>0.5.1-incubating-SNAPSHOT</version> - <relativePath>../../pom.xml</relativePath> - </parent> - - <artifactId>hivemall-spark-common</artifactId> - <name>Hivemall on Spark Common</name> - <packaging>jar</packaging> - - <properties> - <main.basedir>${project.parent.basedir}</main.basedir> - </properties> - - <dependencies> - <!-- hivemall dependencies --> - <dependency> - <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall-core</artifactId> - <version>${project.version}</version> - <scope>compile</scope> - </dependency> - - <!-- other provided dependencies --> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-sql_${scala.binary.version}</artifactId> - <version>${spark.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-hive_${scala.binary.version}</artifactId> - <version>${spark.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-streaming_${scala.binary.version}</artifactId> - <version>${spark.version}</version> - <scope>provided</scope> - </dependency> - - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-common</artifactId> - <version>${hadoop.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-mapreduce-client-core</artifactId> - <version>${hadoop.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hive</groupId> - <artifactId>hive-exec</artifactId> - 
<version>${hive.version}</version> - <scope>provided</scope> - </dependency> - </dependencies> - - <build> - <directory>target</directory> - <outputDirectory>target/classes</outputDirectory> - <finalName>${project.artifactId}-${project.version}</finalName> - <testOutputDirectory>target/test-classes</testOutputDirectory> - <plugins> - <!-- For resolving spark binary incompatibility --> - <plugin> - <artifactId>maven-clean-plugin</artifactId> - <version>3.0.0</version> - <executions> - <execution> - <phase>initialize</phase> - <goals> - <goal>clean</goal> - </goals> - </execution> - </executions> - </plugin> - <!-- For incremental compilation --> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <version>3.2.2</version> - <executions> - <execution> - <id>scala-compile-first</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile-first</id> - <phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - <configuration> - <scalaVersion>${scala.version}</scalaVersion> - <recompileMode>incremental</recompileMode> - <useZincServer>true</useZincServer> - <args> - <arg>-unchecked</arg> - <arg>-deprecation</arg> - <!-- TODO: To enable this option, we need to fix many wornings --> - <!-- <arg>-feature</arg> --> - </args> - <jvmArgs> - <jvmArg>-Xms512m</jvmArg> - <jvmArg>-Xmx1024m</jvmArg> - </jvmArgs> - </configuration> - </plugin> - </plugins> - </build> -</project> - http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/scalastyle-config.xml ---------------------------------------------------------------------- diff --git a/spark/spark-common/scalastyle-config.xml b/spark/spark-common/scalastyle-config.xml deleted file mode 100644 index 13d1c47..0000000 --- a/spark/spark-common/scalastyle-config.xml +++ /dev/null @@ -1,333 +0,0 @@ -<!-- - Licensed to 
the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<!-- -If you wish to turn off checking for a section of code, you can put a comment in the source -before and after the section, with the following syntax: - - // scalastyle:off - ... // stuff that breaks the styles - // scalastyle:on - -You can also disable only one rule, by specifying its rule id, as specified in: - http://www.scalastyle.org/rules-0.7.0.html - - // scalastyle:off no.finalize - override def finalize(): Unit = ... - // scalastyle:on no.finalize - -This file is divided into 3 sections: - (1) rules that we enforce. - (2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet - (or we need to make the scalastyle rule more configurable). - (3) rules that we don't want to enforce. 
---> - -<scalastyle> - <name>Scalastyle standard configuration</name> - - <!-- ================================================================================ --> - <!-- rules we enforce --> - <!-- ================================================================================ --> - - <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true"> - <parameters> - <parameter name="header"><![CDATA[/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */]]></parameter> - </parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true"> - <parameters> - <parameter name="maxLineLength"><![CDATA[100]]></parameter> - <parameter name="tabSize"><![CDATA[2]]></parameter> - <parameter name="ignoreImports">true</parameter> - </parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true"> - <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true"> - <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true"> - <parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true"> - <parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true"> - <parameters> - <parameter 
name="singleLineAllowed"><![CDATA[true]]></parameter> - <parameter name="doubleLineAllowed"><![CDATA[true]]></parameter> - </parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check> - - <check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true"> - <parameters> - <parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter> - </parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true"> - <parameters> - <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter> - </parameters> - </check> - - <!-- ??? usually shouldn't be checked into the code base. --> - <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check> - - <!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' --> - <check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> - <parameters><parameter name="regex">^println$</parameter></parameters> - <customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with - // scalastyle:off println - println(...) 
- // scalastyle:on println]]></customMessage> - </check> - - <check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">@VisibleForTesting</parameter></parameters> - <customMessage><![CDATA[ - @VisibleForTesting causes classpath issues. Please note this in the java doc instead (SPARK-11615). - ]]></customMessage> - </check> - - <check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters> - <customMessage><![CDATA[ - Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use - ShutdownHookManager.addShutdownHook instead. - If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with - // scalastyle:off runtimeaddshutdownhook - Runtime.getRuntime.addShutdownHook(...) - // scalastyle:on runtimeaddshutdownhook - ]]></customMessage> - </check> - - <check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters> - <customMessage><![CDATA[ - Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use - java.util.concurrent.ConcurrentLinkedQueue instead. - If you must use mutable.SynchronizedBuffer, wrap the code block with - // scalastyle:off mutablesynchronizedbuffer - mutable.SynchronizedBuffer[...] - // scalastyle:on mutablesynchronizedbuffer - ]]></customMessage> - </check> - - <check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">Class\.forName</parameter></parameters> - <customMessage><![CDATA[ - Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead. 
- If you must use Class.forName, wrap the code block with - // scalastyle:off classforname - Class.forName(...) - // scalastyle:on classforname - ]]></customMessage> - </check> - - <check customId="awaitresult" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">Await\.result</parameter></parameters> - <customMessage><![CDATA[ - Are you sure that you want to use Await.result? In most cases, you should use ThreadUtils.awaitResult instead. - If you must use Await.result, wrap the code block with - // scalastyle:off awaitresult - Await.result(...) - // scalastyle:on awaitresult - ]]></customMessage> - </check> - - <!-- As of SPARK-9613 JavaConversions should be replaced with JavaConverters --> - <check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> - <parameters><parameter name="regex">JavaConversions</parameter></parameters> - <customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import - scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage> - </check> - - <check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> - <parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters> - <customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead - of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage> - </check> - - <check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true"> - <parameters> - <parameter name="groups">java,scala,3rdParty,spark</parameter> - <parameter name="group.java">javax?\..*</parameter> - <parameter name="group.scala">scala\..*</parameter> - <parameter name="group.3rdParty">(?!org\.apache\.spark\.).*</parameter> - <parameter name="group.spark">org\.apache\.spark\..*</parameter> - </parameters> - </check> - - <check level="error" 
class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true"> - <parameters> - <parameter name="tokens">COMMA</parameter> - </parameters> - </check> - - <!-- SPARK-3854: Single Space between ')' and '{' --> - <check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">\)\{</parameter></parameters> - <customMessage><![CDATA[ - Single Space between ')' and `{`. - ]]></customMessage> - </check> - - <check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]</parameter></parameters> - <customMessage>Use Javadoc style indentation for multiline comments</customMessage> - </check> - - <check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters> - <customMessage>Omit braces in case clauses.</customMessage> - </check> - - <!-- SPARK-16877: Avoid Java annotations --> - <check customId="OverrideJavaCase" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> - <parameters><parameter name="regex">^Override$</parameter></parameters> - <customMessage>override modifier should be used instead of @java.lang.Override.</customMessage> - </check> - - <check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"></check> - - <!-- ================================================================================ --> - <!-- rules we'd like to enforce, but haven't cleaned up the codebase yet --> - <!-- ================================================================================ --> - - <!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. 
--> - <!-- Ideally the following two rules should be configurable to rule out string interpolation. --> - <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check> - <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check> - - <!-- This breaks symbolic method names so we don't turn it on. --> - <!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. --> - <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false"> - <parameters> - <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter> - </parameters> - </check> - - <!-- Should turn this on, but we have a few places that need to be fixed first --> - <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check> - - <!-- ================================================================================ --> - <!-- rules we don't want --> - <!-- ================================================================================ --> - - <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false"> - <parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters> - </check> - - <!-- We want the opposite of this: NewLineAtEofChecker --> - <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check> - - <!-- This one complains about all kinds of random things. Disable. 
--> - <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check> - - <!-- We use return quite a bit for control flows and guards --> - <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check> - - <!-- We use null a lot in low level code and to interface with 3rd party code --> - <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false"> - <parameters><parameter name="maxFileLength">800></parameter></parameters> - </check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false"> - <parameters><parameter name="maxTypes">30</parameter></parameters> - </check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false"> - <parameters><parameter name="maximum">10</parameter></parameters> - </check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false"> - <parameters><parameter name="maxLength">50</parameter></parameters> - </check> - - <!-- Not exactly feasible to enforce this right now. --> - <!-- It is also infrequent that somebody introduces a new class with a lot of methods. --> - <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false"> - <parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters> - </check> - - <!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... 
--> - <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false"> - <parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters> - </check> - -</scalastyle> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/dataset/LogisticRegressionDataGeneratorUDTFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/dataset/LogisticRegressionDataGeneratorUDTFWrapper.java b/spark/spark-common/src/main/java/hivemall/dataset/LogisticRegressionDataGeneratorUDTFWrapper.java deleted file mode 100644 index cf10ed7..0000000 --- a/spark/spark-common/src/main/java/hivemall/dataset/LogisticRegressionDataGeneratorUDTFWrapper.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package hivemall.dataset; - -import hivemall.UDTFWithOptions; - -import java.lang.reflect.Field; -import java.lang.reflect.Method; -import java.util.Random; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.Options; -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.Collector; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; - -/** - * A wrapper of [[hivemall.dataset.LogisticRegressionDataGeneratorUDTF]]. This wrapper is needed - * because Spark cannot handle HadoopUtils#getTaskId() correctly. - */ -@Description(name = "lr_datagen", - value = "_FUNC_(options string) - Generates a logistic regression dataset") -public final class LogisticRegressionDataGeneratorUDTFWrapper extends UDTFWithOptions { - private transient LogisticRegressionDataGeneratorUDTF udtf = - new LogisticRegressionDataGeneratorUDTF(); - - @Override - protected Options getOptions() { - Options options = null; - try { - Method m = udtf.getClass().getDeclaredMethod("getOptions"); - m.setAccessible(true); - options = (Options) m.invoke(udtf); - } catch (Exception e) { - e.printStackTrace(); - } - return options; - } - - @SuppressWarnings("all") - @Override - protected CommandLine processOptions(ObjectInspector[] objectInspectors) - throws UDFArgumentException { - CommandLine commands = null; - try { - Method m = udtf.getClass().getDeclaredMethod("processOptions"); - m.setAccessible(true); - commands = (CommandLine) m.invoke(udtf, objectInspectors); - } catch (Exception e) { - e.printStackTrace(); - } - return commands; - } - - @Override - public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { - try { - // 
Extract a collector for LogisticRegressionDataGeneratorUDTF - Field collector = GenericUDTF.class.getDeclaredField("collector"); - collector.setAccessible(true); - udtf.setCollector((Collector) collector.get(this)); - - // To avoid HadoopUtils#getTaskId() - Class<?> clazz = udtf.getClass(); - Field rnd1 = clazz.getDeclaredField("rnd1"); - Field rnd2 = clazz.getDeclaredField("rnd2"); - Field r_seed = clazz.getDeclaredField("r_seed"); - r_seed.setAccessible(true); - final long seed = r_seed.getLong(udtf) + (int) Thread.currentThread().getId(); - rnd1.setAccessible(true); - rnd2.setAccessible(true); - rnd1.set(udtf, new Random(seed)); - rnd2.set(udtf, new Random(seed + 1)); - } catch (Exception e) { - e.printStackTrace(); - } - return udtf.initialize(argOIs); - } - - @Override - public void process(Object[] objects) throws HiveException { - udtf.process(objects); - } - - @Override - public void close() throws HiveException { - udtf.close(); - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/ftvec/AddBiasUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/ftvec/AddBiasUDFWrapper.java b/spark/spark-common/src/main/java/hivemall/ftvec/AddBiasUDFWrapper.java deleted file mode 100644 index b454fd9..0000000 --- a/spark/spark-common/src/main/java/hivemall/ftvec/AddBiasUDFWrapper.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.ftvec; - -import java.util.Arrays; -import java.util.List; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.UDFType; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; - -/** - * A wrapper of [[hivemall.ftvec.AddBiasUDF]]. - * - * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle List<> - * as a return type in Hive UDF. Therefore, the type must be passed via ObjectInspector. 
- */ -@Description(name = "add_bias", - value = "_FUNC_(features in array<string>) - Returns features with a bias as array<string>") -@UDFType(deterministic = true, stateful = false) -public class AddBiasUDFWrapper extends GenericUDF { - private AddBiasUDF udf = new AddBiasUDF(); - private ListObjectInspector argumentOI = null; - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 1) { - throw new UDFArgumentLengthException( - "add_bias() has an single arguments: array<string> features"); - } - - switch (arguments[0].getCategory()) { - case LIST: - argumentOI = (ListObjectInspector) arguments[0]; - ObjectInspector elmOI = argumentOI.getListElementObjectInspector(); - if (elmOI.getCategory().equals(Category.PRIMITIVE)) { - if (((PrimitiveObjectInspector) elmOI).getPrimitiveCategory() == PrimitiveCategory.STRING) { - break; - } - } - default: - throw new UDFArgumentTypeException(0, "Type mismatch: features"); - } - - return ObjectInspectorFactory.getStandardListObjectInspector(argumentOI.getListElementObjectInspector()); - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - assert (arguments.length == 1); - @SuppressWarnings("unchecked") - final List<String> input = (List<String>) argumentOI.getList(arguments[0].get()); - return udf.evaluate(input); - } - - @Override - public String getDisplayString(String[] children) { - return "add_bias(" + Arrays.toString(children) + ")"; - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/ftvec/AddFeatureIndexUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/ftvec/AddFeatureIndexUDFWrapper.java b/spark/spark-common/src/main/java/hivemall/ftvec/AddFeatureIndexUDFWrapper.java deleted file mode 100644 index 0b687db..0000000 --- 
a/spark/spark-common/src/main/java/hivemall/ftvec/AddFeatureIndexUDFWrapper.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.ftvec; - -import java.util.Arrays; -import java.util.List; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.UDFType; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; - -/** - * A wrapper of [[hivemall.ftvec.AddFeatureIndexUDF]]. - * - * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle List<> - * as a return type in Hive UDF. 
Therefore, the type must be passed via ObjectInspector. - */ -@Description( - name = "add_feature_index", - value = "_FUNC_(dense features in array<double>) - Returns a feature vector with feature indices") -@UDFType(deterministic = true, stateful = false) -public class AddFeatureIndexUDFWrapper extends GenericUDF { - private AddFeatureIndexUDF udf = new AddFeatureIndexUDF(); - private ListObjectInspector argumentOI = null; - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 1) { - throw new UDFArgumentLengthException( - "add_feature_index() has an single arguments: array<double> features"); - } - - switch (arguments[0].getCategory()) { - case LIST: - argumentOI = (ListObjectInspector) arguments[0]; - ObjectInspector elmOI = argumentOI.getListElementObjectInspector(); - if (elmOI.getCategory().equals(Category.PRIMITIVE)) { - if (((PrimitiveObjectInspector) elmOI).getPrimitiveCategory() == PrimitiveCategory.DOUBLE) { - break; - } - } - default: - throw new UDFArgumentTypeException(0, "Type mismatch: features"); - } - - return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - assert (arguments.length == 1); - @SuppressWarnings("unchecked") - final List<Double> input = (List<Double>) argumentOI.getList(arguments[0].get()); - return udf.evaluate(input); - } - - @Override - public String getDisplayString(String[] children) { - return "add_feature_index(" + Arrays.toString(children) + ")"; - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/ftvec/ExtractFeatureUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/ftvec/ExtractFeatureUDFWrapper.java 
b/spark/spark-common/src/main/java/hivemall/ftvec/ExtractFeatureUDFWrapper.java deleted file mode 100644 index 5924468..0000000 --- a/spark/spark-common/src/main/java/hivemall/ftvec/ExtractFeatureUDFWrapper.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.ftvec; - -import java.util.Arrays; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.UDFType; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; - -/** - * A wrapper of [[hivemall.ftvec.ExtractFeatureUDF]]. - * - * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle List<> - * as a return type in Hive UDF. 
Therefore, the type must be passed via ObjectInspector. - */ -@Description(name = "extract_feature", - value = "_FUNC_(feature in string) - Returns a parsed feature as string") -@UDFType(deterministic = true, stateful = false) -public class ExtractFeatureUDFWrapper extends GenericUDF { - private ExtractFeatureUDF udf = new ExtractFeatureUDF(); - private PrimitiveObjectInspector argumentOI = null; - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 1) { - throw new UDFArgumentLengthException( - "extract_feature() has an single arguments: string feature"); - } - - argumentOI = (PrimitiveObjectInspector) arguments[0]; - if (argumentOI.getPrimitiveCategory() != PrimitiveCategory.STRING) { - throw new UDFArgumentTypeException(0, "Type mismatch: feature"); - } - - return PrimitiveObjectInspectorFactory.javaStringObjectInspector; - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - assert (arguments.length == 1); - final String input = (String) argumentOI.getPrimitiveJavaObject(arguments[0].get()); - return udf.evaluate(input); - } - - @Override - public String getDisplayString(String[] children) { - return "extract_feature(" + Arrays.toString(children) + ")"; - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/ftvec/ExtractWeightUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/ftvec/ExtractWeightUDFWrapper.java b/spark/spark-common/src/main/java/hivemall/ftvec/ExtractWeightUDFWrapper.java deleted file mode 100644 index 8580247..0000000 --- a/spark/spark-common/src/main/java/hivemall/ftvec/ExtractWeightUDFWrapper.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.ftvec; - -import java.util.Arrays; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.UDFType; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; - -/** - * A wrapper of [[hivemall.ftvec.ExtractWeightUDF]]. - * - * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle List<> - * as a return type in Hive UDF. Therefore, the type must be passed via ObjectInspector. 
- */ -@Description(name = "extract_weight", - value = "_FUNC_(feature in string) - Returns the weight of a feature as string") -@UDFType(deterministic = true, stateful = false) -public class ExtractWeightUDFWrapper extends GenericUDF { - private ExtractWeightUDF udf = new ExtractWeightUDF(); - private PrimitiveObjectInspector argumentOI = null; - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 1) { - throw new UDFArgumentLengthException( - "extract_weight() has an single arguments: string feature"); - } - - argumentOI = (PrimitiveObjectInspector) arguments[0]; - if (argumentOI.getPrimitiveCategory() != PrimitiveCategory.STRING) { - throw new UDFArgumentTypeException(0, "Type mismatch: feature"); - } - - return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.DOUBLE); - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - assert (arguments.length == 1); - final String input = (String) argumentOI.getPrimitiveJavaObject(arguments[0].get()); - return udf.evaluate(input); - } - - @Override - public String getDisplayString(String[] children) { - return "extract_weight(" + Arrays.toString(children) + ")"; - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/ftvec/SortByFeatureUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/ftvec/SortByFeatureUDFWrapper.java b/spark/spark-common/src/main/java/hivemall/ftvec/SortByFeatureUDFWrapper.java deleted file mode 100644 index 584be6c..0000000 --- a/spark/spark-common/src/main/java/hivemall/ftvec/SortByFeatureUDFWrapper.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.ftvec; - -import java.util.Arrays; -import java.util.Map; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.UDFType; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; - -/** - * A wrapper of [[hivemall.ftvec.SortByFeatureUDF]]. - * - * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark cannot handle Map<> - * as a return type in Hive UDF. Therefore, the type must be passed via ObjectInspector. 
- */ -@Description(name = "sort_by_feature", - value = "_FUNC_(map in map<int,float>) - Returns a sorted map") -@UDFType(deterministic = true, stateful = false) -public class SortByFeatureUDFWrapper extends GenericUDF { - private SortByFeatureUDF udf = new SortByFeatureUDF(); - private MapObjectInspector argumentOI = null; - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 1) { - throw new UDFArgumentLengthException( - "sorted_by_feature() has an single arguments: map<int, float> map"); - } - - switch (arguments[0].getCategory()) { - case MAP: - argumentOI = (MapObjectInspector) arguments[0]; - ObjectInspector keyOI = argumentOI.getMapKeyObjectInspector(); - ObjectInspector valueOI = argumentOI.getMapValueObjectInspector(); - if (keyOI.getCategory().equals(Category.PRIMITIVE) - && valueOI.getCategory().equals(Category.PRIMITIVE)) { - final PrimitiveCategory keyCategory = ((PrimitiveObjectInspector) keyOI).getPrimitiveCategory(); - final PrimitiveCategory valueCategory = ((PrimitiveObjectInspector) valueOI).getPrimitiveCategory(); - if (keyCategory == PrimitiveCategory.INT - && valueCategory == PrimitiveCategory.FLOAT) { - break; - } - } - default: - throw new UDFArgumentTypeException(0, "Type mismatch: map"); - } - - - return ObjectInspectorFactory.getStandardMapObjectInspector( - argumentOI.getMapKeyObjectInspector(), argumentOI.getMapValueObjectInspector()); - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - assert (arguments.length == 1); - @SuppressWarnings("unchecked") - final Map<IntWritable, FloatWritable> input = (Map<IntWritable, FloatWritable>) argumentOI.getMap(arguments[0].get()); - return udf.evaluate(input); - } - - @Override - public String getDisplayString(String[] children) { - return "sort_by_feature(" + Arrays.toString(children) + ")"; - } -} 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDFWrapper.java b/spark/spark-common/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDFWrapper.java deleted file mode 100644 index db533be..0000000 --- a/spark/spark-common/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDFWrapper.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package hivemall.ftvec.scaling; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.io.Text; - -/** - * A wrapper of [[hivemall.ftvec.scaling.L2NormalizationUDF]]. - * - * NOTE: This is needed to avoid the issue of Spark reflection. That is, spark-1.3 cannot handle - * List<> as a return type in Hive UDF. The type must be passed via ObjectInspector. This issues has - * been reported in SPARK-6747, so a future release of Spark makes the wrapper obsolete. 
- */ -public class L2NormalizationUDFWrapper extends GenericUDF { - private L2NormalizationUDF udf = new L2NormalizationUDF(); - - private transient List<Text> retValue = new ArrayList<Text>(); - private transient Converter toListText = null; - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 1) { - throw new UDFArgumentLengthException("normalize() has an only single argument."); - } - - switch (arguments[0].getCategory()) { - case LIST: - ObjectInspector elmOI = ((ListObjectInspector) arguments[0]).getListElementObjectInspector(); - if (elmOI.getCategory().equals(Category.PRIMITIVE)) { - if (((PrimitiveObjectInspector) elmOI).getPrimitiveCategory() == PrimitiveCategory.STRING) { - break; - } - } - default: - throw new UDFArgumentTypeException(0, - "normalize() must have List[String] as an argument, but " - + arguments[0].getTypeName() + " was found."); - } - - // Create a ObjectInspector converter for arguments - ObjectInspector outputElemOI = ObjectInspectorFactory.getReflectionObjectInspector( - Text.class, ObjectInspectorOptions.JAVA); - ObjectInspector outputOI = ObjectInspectorFactory.getStandardListObjectInspector(outputElemOI); - toListText = ObjectInspectorConverters.getConverter(arguments[0], outputOI); - - ObjectInspector listElemOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; - ObjectInspector returnElemOI = ObjectInspectorUtils.getStandardObjectInspector(listElemOI); - return ObjectInspectorFactory.getStandardListObjectInspector(returnElemOI); - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - assert (arguments.length == 1); - @SuppressWarnings("unchecked") - final List<Text> input = (List<Text>) toListText.convert(arguments[0].get()); - retValue = udf.evaluate(input); - return retValue; - } - - @Override - public String getDisplayString(String[] children) { - return "normalize(" + Arrays.toString(children) + 
")"; - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/knn/lsh/MinHashesUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/knn/lsh/MinHashesUDFWrapper.java b/spark/spark-common/src/main/java/hivemall/knn/lsh/MinHashesUDFWrapper.java deleted file mode 100644 index d3bcbe6..0000000 --- a/spark/spark-common/src/main/java/hivemall/knn/lsh/MinHashesUDFWrapper.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package hivemall.knn.lsh; - -import java.util.Arrays; -import java.util.List; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.UDFType; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; - -/** A wrapper of [[hivemall.knn.lsh.MinHashesUDF]]. */ -@Description( - name = "minhashes", - value = "_FUNC_(features in array<string>, noWeight in boolean) - Returns hashed features as array<int>") -@UDFType(deterministic = true, stateful = false) -public class MinHashesUDFWrapper extends GenericUDF { - private MinHashesUDF udf = new MinHashesUDF(); - private ListObjectInspector featuresOI = null; - private PrimitiveObjectInspector noWeightOI = null; - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 2) { - throw new UDFArgumentLengthException( - "minhashes() has 2 arguments: array<string> features, boolean noWeight"); - } - - // Check argument types - switch (arguments[0].getCategory()) { - case LIST: - featuresOI = (ListObjectInspector) arguments[0]; - ObjectInspector elmOI = featuresOI.getListElementObjectInspector(); - if (elmOI.getCategory().equals(Category.PRIMITIVE)) { - if (((PrimitiveObjectInspector) elmOI).getPrimitiveCategory() == PrimitiveCategory.STRING) { - 
break; - } - } - default: - throw new UDFArgumentTypeException(0, "Type mismatch: features"); - } - - noWeightOI = (PrimitiveObjectInspector) arguments[1]; - if (noWeightOI.getPrimitiveCategory() != PrimitiveCategory.BOOLEAN) { - throw new UDFArgumentException("Type mismatch: noWeight"); - } - - return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.INT)); - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - assert (arguments.length == 2); - @SuppressWarnings("unchecked") - final List<String> features = (List<String>) featuresOI.getList(arguments[0].get()); - final Boolean noWeight = PrimitiveObjectInspectorUtils.getBoolean(arguments[1].get(), - noWeightOI); - return udf.evaluate(features, noWeight); - } - - @Override - public String getDisplayString(String[] children) { - /** - * TODO: Need to return hive-specific type names. - */ - return "minhashes(" + Arrays.toString(children) + ")"; - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/java/hivemall/tools/mapred/RowIdUDFWrapper.java ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/java/hivemall/tools/mapred/RowIdUDFWrapper.java b/spark/spark-common/src/main/java/hivemall/tools/mapred/RowIdUDFWrapper.java deleted file mode 100644 index f386223..0000000 --- a/spark/spark-common/src/main/java/hivemall/tools/mapred/RowIdUDFWrapper.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.tools.mapred; - -import java.util.UUID; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.UDFType; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; - -/** An alternative implementation of [[hivemall.tools.mapred.RowIdUDF]]. */ -@Description( - name = "rowid", - value = "_FUNC_() - Returns a generated row id of a form {TASK_ID}-{UUID}-{SEQUENCE_NUMBER}") -@UDFType(deterministic = false, stateful = true) -public class RowIdUDFWrapper extends GenericUDF { - // RowIdUDF is directly used because spark cannot - // handle HadoopUtils#getTaskId(). 
- - private long sequence; - private long taskId; - - public RowIdUDFWrapper() { - this.sequence = 0L; - this.taskId = Thread.currentThread().getId(); - } - - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - if (arguments.length != 0) { - throw new UDFArgumentLengthException("row_number() has no argument."); - } - - return PrimitiveObjectInspectorFactory.javaStringObjectInspector; - } - - @Override - public Object evaluate(DeferredObject[] arguments) throws HiveException { - assert (arguments.length == 0); - sequence++; - /** - * TODO: Check if it is unique over all tasks in executors of Spark. - */ - return taskId + "-" + UUID.randomUUID() + "-" + sequence; - } - - @Override - public String getDisplayString(String[] children) { - return "row_number()"; - } -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e2fb6f86/spark/spark-common/src/main/scala/hivemall/HivemallException.scala ---------------------------------------------------------------------- diff --git a/spark/spark-common/src/main/scala/hivemall/HivemallException.scala b/spark/spark-common/src/main/scala/hivemall/HivemallException.scala deleted file mode 100644 index 53f6756..0000000 --- a/spark/spark-common/src/main/scala/hivemall/HivemallException.scala +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
/**
 * Exception type raised by the Hivemall Spark integration.
 *
 * @param message description of the failure
 * @param cause underlying cause, or `null` when there is none
 */
class HivemallException(message: String, cause: Throwable) extends Exception(message, cause) {

  /**
   * Creates an exception that carries only a message and no cause.
   *
   * @param message description of the failure
   */
  def this(message: String) = {
    this(message, null)
  }
}
/** Wrapper of a single feature string; used for DataFrame#explode. */
case class HivemallFeature(feature: String)

/**
 * Class that represents the features and labels of a data point for Hivemall.
 *
 * @param label Label for this data point.
 * @param features List of features for this data point.
 */
case class HivemallLabeledPoint(label: Float = 0.0f, features: Seq[String]) {

  /** Renders the point as `label,[f1,f2,...]`, the form accepted by `parse`. */
  override def toString: String = {
    "%s,%s".format(label, features.mkString("[", ",", "]"))
  }
}

object HivemallLabeledPoint {

  /**
   * Simple parser for [[HivemallLabeledPoint]].
   *
   * The text before the first comma is the label and the rest is the feature tuple. When the
   * first comma is missing or is the first character, a dummy point with label `0.0` and no
   * features is returned.
   *
   * @param s text such as `1.0,[a,b]`
   * @return the parsed data point
   * @throws NumberFormatException if the label part is not a valid float
   * @throws HivemallException if the feature tuple is malformed
   */
  def parse(s: String): HivemallLabeledPoint = {
    val (label, features) = s.indexOf(',') match {
      case d if d > 0 => (s.substring(0, d), s.substring(d + 1))
      case _ => ("0.0", "[]") // Dummy
    }
    HivemallLabeledPoint(label.toFloat, parseTuple(new StringTokenizer(features, "[],", true)))
  }

  /**
   * Reads items from the tokenizer until the enclosing tuple is closed by `]`.
   *
   * An opening `[` is handled by a recursive call whose items are appended to the current
   * ones; the current level stops once that nested tuple has been read.
   *
   * @param tokenizer tokenizer over the feature text that also returns the delimiters `[`,
   *                  `]` and `,` as tokens
   * @return the collected items as an immutable list
   * @throws HivemallException if a `,` does not follow an item, or the tokens run out before
   *                           a closing `]`
   */
  // TODO: Support to parse rows without labels
  private[this] def parseTuple(tokenizer: StringTokenizer): Seq[String] = {
    val items = ListBuffer.empty[String]
    var parsing = true
    // True only right after an item or a nested tuple, i.e. where a ',' is legal.
    var allowDelim = false
    while (parsing && tokenizer.hasMoreTokens()) {
      val token = tokenizer.nextToken()
      if (token == "[") {
        items ++= parseTuple(tokenizer)
        parsing = false
        allowDelim = true
      } else if (token == ",") {
        if (allowDelim) {
          allowDelim = false
        } else {
          throw new HivemallException("Found ',' at a wrong position.")
        }
      } else if (token == "]") {
        parsing = false
      } else {
        items.append(token)
        allowDelim = true
      }
    }
    if (parsing) {
      throw new HivemallException("A tuple must end with ']'.")
    }
    // Hand out an immutable snapshot rather than the internal mutable buffer.
    items.toList
  }
}
