http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/pom.xml ---------------------------------------------------------------------- diff --git a/spark/pom.xml b/spark/pom.xml new file mode 100644 index 0000000..d018b8d --- /dev/null +++ b/spark/pom.xml @@ -0,0 +1,295 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall</artifactId> + <version>0.5.1-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <artifactId>hivemall-spark</artifactId> + <packaging>pom</packaging> + <name>Hivemall on Apache Spark</name> + + <modules> + <module>common</module> + <module>spark-2.0</module> + <module>spark-2.1</module> + <module>spark-2.2</module> + </modules> + + <properties> + <main.basedir>${project.parent.basedir}</main.basedir> + <scala.version>2.11.8</scala.version> + <scala.binary.version>2.11</scala.binary.version> + <scalatest.jvm.opts>-ea -Xms768m -Xmx1024m -XX:PermSize=128m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m</scalatest.jvm.opts> + </properties> + + <dependencyManagement> + <dependencies> + <!-- compile scope --> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-core</artifactId> + <version>${project.version}</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-xgboost</artifactId> + <version>${project.version}</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <version>1.8</version> + <scope>compile</scope> + </dependency> + + <!-- provided scope --> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-library</artifactId> + <version>${scala.version}</version> + <scope>provided</scope> + </dependency> + + <!-- test dependencies --> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-mixserv</artifactId> + <version>${project.version}</version> + <scope>test</scope> + 
</dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_${scala.binary.version}</artifactId> + <version>2.2.4</version> + <scope>test</scope> + </dependency> + </dependencies> + </dependencyManagement> + + <build> + <directory>target</directory> + <outputDirectory>target/classes</outputDirectory> + <finalName>${project.artifactId}-${project.version}</finalName> + <testOutputDirectory>target/test-classes</testOutputDirectory> + + <pluginManagement> + <plugins> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <version>3.2.2</version> + </plugin> + <plugin> + <groupId>org.scalatest</groupId> + <artifactId>scalatest-maven-plugin</artifactId> + <version>1.0</version> + <configuration> + <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory> + <junitxml>.</junitxml> + <filereports>SparkTestSuite.txt</filereports> + <argLine>${scalatest.jvm.opts}</argLine> + <stderr /> + <environmentVariables> + <SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES> + <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION> + <SPARK_TESTING>1</SPARK_TESTING> + <JAVA_HOME>${env.JAVA_HOME}</JAVA_HOME> + <PATH>${env.JAVA_HOME}/bin:${env.PATH}</PATH> + </environmentVariables> + <systemProperties> + <log4j.configuration>file:src/test/resources/log4j.properties</log4j.configuration> + <derby.system.durability>test</derby.system.durability> + <java.awt.headless>true</java.awt.headless> + <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir> + <spark.testing>1</spark.testing> + <spark.ui.enabled>false</spark.ui.enabled> + <spark.ui.showConsoleProgress>false</spark.ui.showConsoleProgress> + <spark.unsafe.exceptionOnMemoryLeak>true</spark.unsafe.exceptionOnMemoryLeak> + <!-- Needed by sql/hive tests. 
--> + <test.src.tables>__not_used__</test.src.tables> + </systemProperties> + <tagsToExclude>${test.exclude.tags}</tagsToExclude> + </configuration> + </plugin> + <!-- hivemall-spark_xx-xx-with-dependencies.jar including minimum dependencies --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <executions> + <execution> + <id>jar-with-dependencies</id> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <finalName>${project.artifactId}-${project.version}-with-dependencies</finalName> + <outputDirectory>${main.basedir}/target</outputDirectory> + <minimizeJar>false</minimizeJar> + <createDependencyReducedPom>false</createDependencyReducedPom> + <createSourcesJar>true</createSourcesJar> + <artifactSet> + <includes> + <include>org.apache.hivemall:hivemall-spark-common</include> + <!-- hivemall-core --> + <include>org.apache.hivemall:hivemall-core</include> + <include>io.netty:netty-all</include> + <include>com.github.haifengl:smile-core</include> + <include>com.github.haifengl:smile-math</include> + <include>com.github.haifengl:smile-data</include> + <include>org.tukaani:xz</include> + <include>org.apache.commons:commons-math3</include> + <include>org.roaringbitmap:RoaringBitmap</include> + <include>it.unimi.dsi:fastutil</include> + <include>com.clearspring.analytics:stream</include> + <!-- hivemall-nlp --> + <include>org.apache.hivemall:hivemall-nlp</include> + <include>org.apache.lucene:lucene-analyzers-kuromoji</include> + <include>org.apache.lucene:lucene-analyzers-smartcn</include> + <include>org.apache.lucene:lucene-analyzers-common</include> + <include>org.apache.lucene:lucene-core</include> + <!-- hivemall-xgboost --> + <include>org.apache.hivemall:hivemall-xgboost</include> + <include>io.github.myui:xgboost4j</include> + <include>com.esotericsoftware.kryo:kryo</include> + </includes> + </artifactSet> + <transformers> + <transformer 
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <manifestEntries> + <Implementation-Title>${project.name}</Implementation-Title> + <Implementation-Version>${project.version}</Implementation-Version> + <Implementation-Vendor>${project.organization.name}</Implementation-Vendor> + </manifestEntries> + </transformer> + </transformers> + <filters> + <filter> + <artifact>org.apache.lucene:*</artifact> + <includes> + <include>**</include> + </includes> + </filter> + <filter> + <artifact>com.esotericsoftware.kryo:kryo</artifact> + <includes> + <include>**</include> + </includes> + </filter> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>META-INF/LICENSE.txt</exclude> + <exclude>META-INF/NOTICE.txt</exclude> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + <exclude>*.jar</exclude> + <exclude>tracker.py</exclude> + </excludes> + </filter> + </filters> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.scalastyle</groupId> + <artifactId>scalastyle-maven-plugin</artifactId> + <version>0.8.0</version> + </plugin> + </plugins> + </pluginManagement> + + <plugins> + <plugin> + <groupId>org.scalastyle</groupId> + <artifactId>scalastyle-maven-plugin</artifactId> + <configuration> + <verbose>false</verbose> + <failOnViolation>true</failOnViolation> + <includeTestSourceDirectory>true</includeTestSourceDirectory> + <failOnWarning>false</failOnWarning> + <sourceDirectory>${basedir}/src/main/scala</sourceDirectory> + <testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory> + <configLocation>spark/scalastyle-config.xml</configLocation> + <outputFile>${basedir}/target/scalastyle-output.xml</outputFile> + <inputEncoding>${project.build.sourceEncoding}</inputEncoding> + <outputEncoding>${project.reporting.outputEncoding}</outputEncoding> + </configuration> + <executions> + <execution> + <goals> + <goal>check</goal> + </goals> + 
</execution> + </executions> + </plugin> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <executions> + <execution> + <id>scala-compile-first</id> + <phase>process-resources</phase> + <goals> + <goal>add-source</goal> + <goal>compile</goal> + </goals> + </execution> + <execution> + <id>scala-test-compile</id> + <phase>process-test-resources</phase> + <goals> + <goal>testCompile</goal> + </goals> + </execution> + </executions> + <!-- For incremental compilation --> + <configuration> + <scalaVersion>${scala.version}</scalaVersion> + <recompileMode>incremental</recompileMode> + <useZincServer>true</useZincServer> + <args> + <arg>-unchecked</arg> + <arg>-deprecation</arg> + <!-- TODO: To enable this option, we need to fix many warnings --> + <!-- <arg>-feature</arg> --> + </args> + <jvmArgs> + <jvmArg>-Xms768m</jvmArg> + <jvmArg>-Xmx1024m</jvmArg> + <jvmArg>-XX:PermSize=128m</jvmArg> + <jvmArg>-XX:MaxPermSize=512m</jvmArg> + <jvmArg>-XX:ReservedCodeCacheSize=512m</jvmArg> + </jvmArgs> + </configuration> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/scalastyle-config.xml ---------------------------------------------------------------------- diff --git a/spark/scalastyle-config.xml b/spark/scalastyle-config.xml new file mode 100644 index 0000000..13d1c47 --- /dev/null +++ b/spark/scalastyle-config.xml @@ -0,0 +1,333 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<!-- +If you wish to turn off checking for a section of code, you can put a comment in the source +before and after the section, with the following syntax: + + // scalastyle:off + ... // stuff that breaks the styles + // scalastyle:on + +You can also disable only one rule, by specifying its rule id, as specified in: + http://www.scalastyle.org/rules-0.7.0.html + + // scalastyle:off no.finalize + override def finalize(): Unit = ... + // scalastyle:on no.finalize + +This file is divided into 3 sections: + (1) rules that we enforce. + (2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet + (or we need to make the scalastyle rule more configurable). + (3) rules that we don't want to enforce. 
+--> + +<scalastyle> + <name>Scalastyle standard configuration</name> + + <!-- ================================================================================ --> + <!-- rules we enforce --> + <!-- ================================================================================ --> + + <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true"> + <parameters> + <parameter name="header"><![CDATA[/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */]]></parameter> + </parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true"> + <parameters> + <parameter name="maxLineLength"><![CDATA[100]]></parameter> + <parameter name="tabSize"><![CDATA[2]]></parameter> + <parameter name="ignoreImports">true</parameter> + </parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true"> + <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true"> + <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true"> + <parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true"> + <parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true"> + <parameters> + <parameter 
name="singleLineAllowed"><![CDATA[true]]></parameter> + <parameter name="doubleLineAllowed"><![CDATA[true]]></parameter> + </parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check> + + <check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check> + + <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true"> + <parameters> + <parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter> + </parameters> + </check> + + <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true"> + <parameters> + <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter> + </parameters> + </check> + + <!-- ??? usually shouldn't be checked into the code base. --> + <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check> + + <!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' --> + <check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> + <parameters><parameter name="regex">^println$</parameter></parameters> + <customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with + // scalastyle:off println + println(...) 
+ // scalastyle:on println]]></customMessage> + </check> + + <check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">@VisibleForTesting</parameter></parameters> + <customMessage><![CDATA[ + @VisibleForTesting causes classpath issues. Please note this in the java doc instead (SPARK-11615). + ]]></customMessage> + </check> + + <check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters> + <customMessage><![CDATA[ + Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use + ShutdownHookManager.addShutdownHook instead. + If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with + // scalastyle:off runtimeaddshutdownhook + Runtime.getRuntime.addShutdownHook(...) + // scalastyle:on runtimeaddshutdownhook + ]]></customMessage> + </check> + + <check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters> + <customMessage><![CDATA[ + Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use + java.util.concurrent.ConcurrentLinkedQueue instead. + If you must use mutable.SynchronizedBuffer, wrap the code block with + // scalastyle:off mutablesynchronizedbuffer + mutable.SynchronizedBuffer[...] + // scalastyle:on mutablesynchronizedbuffer + ]]></customMessage> + </check> + + <check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">Class\.forName</parameter></parameters> + <customMessage><![CDATA[ + Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead. 
+ If you must use Class.forName, wrap the code block with + // scalastyle:off classforname + Class.forName(...) + // scalastyle:on classforname + ]]></customMessage> + </check> + + <check customId="awaitresult" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">Await\.result</parameter></parameters> + <customMessage><![CDATA[ + Are you sure that you want to use Await.result? In most cases, you should use ThreadUtils.awaitResult instead. + If you must use Await.result, wrap the code block with + // scalastyle:off awaitresult + Await.result(...) + // scalastyle:on awaitresult + ]]></customMessage> + </check> + + <!-- As of SPARK-9613 JavaConversions should be replaced with JavaConverters --> + <check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> + <parameters><parameter name="regex">JavaConversions</parameter></parameters> + <customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import + scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage> + </check> + + <check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> + <parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters> + <customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead + of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage> + </check> + + <check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true"> + <parameters> + <parameter name="groups">java,scala,3rdParty,spark</parameter> + <parameter name="group.java">javax?\..*</parameter> + <parameter name="group.scala">scala\..*</parameter> + <parameter name="group.3rdParty">(?!org\.apache\.spark\.).*</parameter> + <parameter name="group.spark">org\.apache\.spark\..*</parameter> + </parameters> + </check> + + <check level="error" 
class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true"> + <parameters> + <parameter name="tokens">COMMA</parameter> + </parameters> + </check> + + <!-- SPARK-3854: Single Space between ')' and '{' --> + <check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">\)\{</parameter></parameters> + <customMessage><![CDATA[ + Single Space between ')' and `{`. + ]]></customMessage> + </check> + + <check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]</parameter></parameters> + <customMessage>Use Javadoc style indentation for multiline comments</customMessage> + </check> + + <check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters> + <customMessage>Omit braces in case clauses.</customMessage> + </check> + + <!-- SPARK-16877: Avoid Java annotations --> + <check customId="OverrideJavaCase" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> + <parameters><parameter name="regex">^Override$</parameter></parameters> + <customMessage>override modifier should be used instead of @java.lang.Override.</customMessage> + </check> + + <check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"></check> + + <!-- ================================================================================ --> + <!-- rules we'd like to enforce, but haven't cleaned up the codebase yet --> + <!-- ================================================================================ --> + + <!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. 
--> + <!-- Ideally the following two rules should be configurable to rule out string interpolation. --> + <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check> + <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check> + + <!-- This breaks symbolic method names so we don't turn it on. --> + <!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. --> + <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false"> + <parameters> + <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter> + </parameters> + </check> + + <!-- Should turn this on, but we have a few places that need to be fixed first --> + <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check> + + <!-- ================================================================================ --> + <!-- rules we don't want --> + <!-- ================================================================================ --> + + <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false"> + <parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters> + </check> + + <!-- We want the opposite of this: NewLineAtEofChecker --> + <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check> + + <!-- This one complains about all kinds of random things. Disable. 
+ --> + <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check> + + <!-- We use return quite a bit for control flows and guards --> + <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check> + + <!-- We use null a lot in low level code and to interface with 3rd party code --> + <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false"> + <parameters><parameter name="maxFileLength">800</parameter></parameters> + </check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false"> + <parameters><parameter name="maxTypes">30</parameter></parameters> + </check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false"> + <parameters><parameter name="maximum">10</parameter></parameters> + </check> + + <!-- Doesn't seem super big deal here ... --> + <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false"> + <parameters><parameter name="maxLength">50</parameter></parameters> + </check> + + <!-- Not exactly feasible to enforce this right now. --> + <!-- It is also infrequent that somebody introduces a new class with a lot of methods. --> + <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false"> + <parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters> + </check> + + <!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... 
--> + <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false"> + <parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters> + </check> + +</scalastyle> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.0/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/pom.xml b/spark/spark-2.0/pom.xml index e197586..54c817d 100644 --- a/spark/spark-2.0/pom.xml +++ b/spark/spark-2.0/pom.xml @@ -16,37 +16,36 @@ specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall</artifactId> - <version>0.5.0-incubating-SNAPSHOT</version> - <relativePath>../../pom.xml</relativePath> + <artifactId>hivemall-spark</artifactId> + <version>0.5.1-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> </parent> - <artifactId>hivemall-spark</artifactId> + <artifactId>hivemall-spark2.0</artifactId> <name>Hivemall on Spark 2.0</name> <packaging>jar</packaging> <properties> - <main.basedir>${project.parent.basedir}</main.basedir> + <main.basedir>${project.parent.parent.basedir}</main.basedir> + <spark.version>2.0.2</spark.version> + <spark.binary.version>2.0</spark.binary.version> </properties> <dependencies> - <!-- hivemall dependencies --> + <!-- compile scope --> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-core</artifactId> - <version>${project.version}</version> <scope>compile</scope> 
</dependency> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-xgboost</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> @@ -56,21 +55,12 @@ <scope>compile</scope> </dependency> - <!-- third-party dependencies --> + <!-- provided scope --> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-library</artifactId> - <version>${scala.version}</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-compress</artifactId> - <version>1.8</version> - <scope>compile</scope> + <scope>provided</scope> </dependency> - - <!-- other provided dependencies --> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_${scala.binary.version}</artifactId> @@ -106,114 +96,26 @@ <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-mixserv</artifactId> - <version>${project.version}</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.xerial</groupId> - <artifactId>xerial-core</artifactId> - <version>3.2.3</version> <scope>test</scope> </dependency> <dependency> <groupId>org.scalatest</groupId> <artifactId>scalatest_${scala.binary.version}</artifactId> - <version>2.2.4</version> <scope>test</scope> </dependency> </dependencies> <build> - <directory>target</directory> - <outputDirectory>target/classes</outputDirectory> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <testOutputDirectory>target/test-classes</testOutputDirectory> <plugins> - <!-- For incremental compilation --> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <version>3.2.2</version> - <executions> - <execution> - <id>scala-compile-first</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile-first</id> - 
<phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - <configuration> - <scalaVersion>${scala.version}</scalaVersion> - <recompileMode>incremental</recompileMode> - <useZincServer>true</useZincServer> - <args> - <arg>-unchecked</arg> - <arg>-deprecation</arg> - <!-- TODO: To enable this option, we need to fix many wornings --> - <!-- <arg>-feature</arg> --> - </args> - <jvmArgs> - <jvmArg>-Xms512m</jvmArg> - <jvmArg>-Xmx1024m</jvmArg> - </jvmArgs> - </configuration> - </plugin> - <!-- hivemall-spark_xx-xx.jar --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <version>2.5</version> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - </configuration> - </plugin> <!-- hivemall-spark_xx-xx-with-dependencies.jar including minimum dependencies --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> - <executions> - <execution> - <id>jar-with-dependencies</id> - <phase>package</phase> - <goals> - <goal>shade</goal> - </goals> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}-with-dependencies</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - <minimizeJar>false</minimizeJar> - <createDependencyReducedPom>false</createDependencyReducedPom> - <artifactSet> - <includes> - <include>org.apache.hivemall:hivemall-core</include> - <include>org.apache.hivemall:hivemall-xgboost</include> - <include>org.apache.hivemall:hivemall-spark-common</include> - <include>com.github.haifengl:smile-core</include> - <include>com.github.haifengl:smile-math</include> - <include>com.github.haifengl:smile-data</include> - 
<include>ml.dmlc:xgboost4j</include> - <include>com.esotericsoftware.kryo:kryo</include> - </includes> - </artifactSet> - </configuration> - </execution> - </executions> </plugin> <!-- disable surefire because there is no java test --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> - <version>2.7</version> <configuration> <skipTests>true</skipTests> </configuration> @@ -222,33 +124,6 @@ <plugin> <groupId>org.scalatest</groupId> <artifactId>scalatest-maven-plugin</artifactId> - <version>1.0</version> - <configuration> - <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory> - <junitxml>.</junitxml> - <filereports>SparkTestSuite.txt</filereports> - <argLine>${spark.test.jvm.opts}</argLine> - <stderr /> - <environmentVariables> - <SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES> - <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION> - <SPARK_TESTING>1</SPARK_TESTING> - <JAVA_HOME>${env.JAVA_HOME}</JAVA_HOME> - </environmentVariables> - <systemProperties> - <log4j.configuration>file:src/test/resources/log4j.properties</log4j.configuration> - <derby.system.durability>test</derby.system.durability> - <java.awt.headless>true</java.awt.headless> - <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir> - <spark.testing>1</spark.testing> - <spark.ui.enabled>false</spark.ui.enabled> - <spark.ui.showConsoleProgress>false</spark.ui.showConsoleProgress> - <spark.unsafe.exceptionOnMemoryLeak>true</spark.unsafe.exceptionOnMemoryLeak> - <!-- Needed by sql/hive tests. 
--> - <test.src.tables>__not_used__</test.src.tables> - </systemProperties> - <tagsToExclude>${test.exclude.tags}</tagsToExclude> - </configuration> <executions> <execution> <id>test</id> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.0/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala b/spark/spark-2.0/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala new file mode 100644 index 0000000..a6bbb4b --- /dev/null +++ b/spark/spark-2.0/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.streaming + +import scala.reflect.ClassTag + +import org.apache.spark.ml.feature.HivemallLabeledPoint +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.streaming.dstream.DStream + +final class HivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) { + + def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext) + : DStream[Row] = { + ds.transform[Row] { rdd: RDD[HivemallLabeledPoint] => + f(sqlContext.createDataFrame(rdd)).rdd + } + } +} + +object HivemallStreamingOps { + + /** + * Implicitly inject the [[HivemallStreamingOps]] into [[DStream]]. + */ + implicit def dataFrameToHivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) + : HivemallStreamingOps = { + new HivemallStreamingOps(ds) + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala index d3bf435..4a43afc 100644 --- a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala +++ b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala @@ -35,7 +35,7 @@ final class HiveUdfWithFeatureSuite extends HivemallFeatureQueryTest { checkAnswer( sql(s"SELECT DISTINCT hivemall_version()"), - Row("0.5.0-incubating-SNAPSHOT") + Row("0.5.1-incubating-SNAPSHOT") ) // sql("DROP TEMPORARY FUNCTION IF EXISTS hivemall_version") http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala ---------------------------------------------------------------------- diff --git 
a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index 5e99fd8..399a557 100644 --- a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -293,7 +293,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest { } test("misc - hivemall_version") { - checkAnswer(DummyInputData.select(hivemall_version()), Row("0.5.0-incubating-SNAPSHOT")) + checkAnswer(DummyInputData.select(hivemall_version()), Row("0.5.1-incubating-SNAPSHOT")) } test("misc - rowid") { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.1/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.1/pom.xml b/spark/spark-2.1/pom.xml index 3d07184..e10b4ab 100644 --- a/spark/spark-2.1/pom.xml +++ b/spark/spark-2.1/pom.xml @@ -16,23 +16,24 @@ specific language governing permissions and limitations under the License. 
--> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall</artifactId> - <version>0.5.0-incubating-SNAPSHOT</version> - <relativePath>../../pom.xml</relativePath> + <artifactId>hivemall-spark</artifactId> + <version>0.5.1-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> </parent> - <artifactId>hivemall-spark</artifactId> + <artifactId>hivemall-spark2.1</artifactId> <name>Hivemall on Spark 2.1</name> <packaging>jar</packaging> <properties> - <main.basedir>${project.parent.basedir}</main.basedir> + <main.basedir>${project.parent.parent.basedir}</main.basedir> + <spark.version>2.1.1</spark.version> + <spark.binary.version>2.1</spark.binary.version> </properties> <dependencies> @@ -40,13 +41,11 @@ <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-core</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-xgboost</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> @@ -56,21 +55,12 @@ <scope>compile</scope> </dependency> - <!-- third-party dependencies --> + <!-- provided scope --> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-library</artifactId> - <version>${scala.version}</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-compress</artifactId> - <version>1.8</version> - <scope>compile</scope> + 
<scope>provided</scope> </dependency> - - <!-- other provided dependencies --> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_${scala.binary.version}</artifactId> @@ -106,114 +96,26 @@ <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-mixserv</artifactId> - <version>${project.version}</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.xerial</groupId> - <artifactId>xerial-core</artifactId> - <version>3.2.3</version> <scope>test</scope> </dependency> <dependency> <groupId>org.scalatest</groupId> <artifactId>scalatest_${scala.binary.version}</artifactId> - <version>2.2.4</version> <scope>test</scope> </dependency> </dependencies> <build> - <directory>target</directory> - <outputDirectory>target/classes</outputDirectory> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <testOutputDirectory>target/test-classes</testOutputDirectory> <plugins> - <!-- For incremental compilation --> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <version>3.2.2</version> - <executions> - <execution> - <id>scala-compile-first</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile-first</id> - <phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - <configuration> - <scalaVersion>${scala.version}</scalaVersion> - <recompileMode>incremental</recompileMode> - <useZincServer>true</useZincServer> - <args> - <arg>-unchecked</arg> - <arg>-deprecation</arg> - <!-- TODO: To enable this option, we need to fix many wornings --> - <!-- <arg>-feature</arg> --> - </args> - <jvmArgs> - <jvmArg>-Xms512m</jvmArg> - <jvmArg>-Xmx1024m</jvmArg> - </jvmArgs> - </configuration> - </plugin> - <!-- hivemall-spark_xx-xx.jar --> - <plugin> - 
<groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <version>2.5</version> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - </configuration> - </plugin> <!-- hivemall-spark_xx-xx-with-dependencies.jar including minimum dependencies --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> - <executions> - <execution> - <id>jar-with-dependencies</id> - <phase>package</phase> - <goals> - <goal>shade</goal> - </goals> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}-with-dependencies</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - <minimizeJar>false</minimizeJar> - <createDependencyReducedPom>false</createDependencyReducedPom> - <artifactSet> - <includes> - <include>org.apache.hivemall:hivemall-core</include> - <include>org.apache.hivemall:hivemall-xgboost</include> - <include>org.apache.hivemall:hivemall-spark-common</include> - <include>com.github.haifengl:smile-core</include> - <include>com.github.haifengl:smile-math</include> - <include>com.github.haifengl:smile-data</include> - <include>ml.dmlc:xgboost4j</include> - <include>com.esotericsoftware.kryo:kryo</include> - </includes> - </artifactSet> - </configuration> - </execution> - </executions> </plugin> <!-- disable surefire because there is no java test --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> - <version>2.7</version> <configuration> <skipTests>true</skipTests> </configuration> @@ -222,33 +124,6 @@ <plugin> <groupId>org.scalatest</groupId> <artifactId>scalatest-maven-plugin</artifactId> - <version>1.0</version> - <configuration> - 
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory> - <junitxml>.</junitxml> - <filereports>SparkTestSuite.txt</filereports> - <argLine>${spark.test.jvm.opts}</argLine> - <stderr /> - <environmentVariables> - <SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES> - <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION> - <SPARK_TESTING>1</SPARK_TESTING> - <JAVA_HOME>${env.JAVA_HOME}</JAVA_HOME> - </environmentVariables> - <systemProperties> - <log4j.configuration>file:src/test/resources/log4j.properties</log4j.configuration> - <derby.system.durability>test</derby.system.durability> - <java.awt.headless>true</java.awt.headless> - <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir> - <spark.testing>1</spark.testing> - <spark.ui.enabled>false</spark.ui.enabled> - <spark.ui.showConsoleProgress>false</spark.ui.showConsoleProgress> - <spark.unsafe.exceptionOnMemoryLeak>true</spark.unsafe.exceptionOnMemoryLeak> - <!-- Needed by sql/hive tests. --> - <test.src.tables>__not_used__</test.src.tables> - </systemProperties> - <tagsToExclude>${test.exclude.tags}</tagsToExclude> - </configuration> <executions> <execution> <id>test</id> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.1/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.1/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala b/spark/spark-2.1/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala new file mode 100644 index 0000000..a6bbb4b --- /dev/null +++ b/spark/spark-2.1/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.streaming + +import scala.reflect.ClassTag + +import org.apache.spark.ml.feature.HivemallLabeledPoint +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.streaming.dstream.DStream + +final class HivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) { + + def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext) + : DStream[Row] = { + ds.transform[Row] { rdd: RDD[HivemallLabeledPoint] => + f(sqlContext.createDataFrame(rdd)).rdd + } + } +} + +object HivemallStreamingOps { + + /** + * Implicitly inject the [[HivemallStreamingOps]] into [[DStream]]. 
+ */ + implicit def dataFrameToHivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) + : HivemallStreamingOps = { + new HivemallStreamingOps(ds) + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala b/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala index eb4ec04..cecceca 100644 --- a/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala +++ b/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala @@ -35,7 +35,7 @@ final class HiveUdfWithFeatureSuite extends HivemallFeatureQueryTest { checkAnswer( sql(s"SELECT DISTINCT hivemall_version()"), - Row("0.5.0-incubating-SNAPSHOT") + Row("0.5.1-incubating-SNAPSHOT") ) // sql("DROP TEMPORARY FUNCTION IF EXISTS hivemall_version") http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index 84ab0cd..8dad4c3 100644 --- a/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-2.1/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -295,7 +295,7 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest { } test("misc - hivemall_version") { - checkAnswer(DummyInputData.select(hivemall_version()), Row("0.5.0-incubating-SNAPSHOT")) + checkAnswer(DummyInputData.select(hivemall_version()), Row("0.5.1-incubating-SNAPSHOT")) } test("misc - rowid") { 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.2/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.2/pom.xml b/spark/spark-2.2/pom.xml index 5366e1d..47aea92 100644 --- a/spark/spark-2.2/pom.xml +++ b/spark/spark-2.2/pom.xml @@ -16,40 +16,40 @@ specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall</artifactId> - <version>0.5.0-incubating-SNAPSHOT</version> - <relativePath>../../pom.xml</relativePath> + <artifactId>hivemall-spark</artifactId> + <version>0.5.1-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> </parent> - <artifactId>hivemall-spark</artifactId> + <artifactId>hivemall-spark2.2</artifactId> <name>Hivemall on Spark 2.2</name> <packaging>jar</packaging> <properties> - <PermGen>64m</PermGen> - <MaxPermGen>512m</MaxPermGen> - <CodeCacheSize>512m</CodeCacheSize> - <main.basedir>${project.parent.basedir}</main.basedir> + <main.basedir>${project.parent.parent.basedir}</main.basedir> + <spark.version>2.2.0</spark.version> + <spark.binary.version>2.2</spark.binary.version> + <hadoop.version>2.6.5</hadoop.version> + <scalatest.jvm.opts>-ea -Xms768m -Xmx2g -XX:MetaspaceSize=128m -XX:MaxMetaspaceSize=512m -XX:ReservedCodeCacheSize=512m</scalatest.jvm.opts> + <maven.compiler.source>1.8</maven.compiler.source> + <maven.compiler.target>1.8</maven.compiler.target> </properties> <dependencies> - <!-- hivemall 
dependencies --> + <!-- compile scope --> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-core</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-xgboost</artifactId> - <version>${project.version}</version> <scope>compile</scope> </dependency> <dependency> @@ -59,22 +59,13 @@ <scope>compile</scope> </dependency> - <!-- third-party dependencies --> + <!-- provided scope --> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-library</artifactId> - <version>${scala.version}</version> - <scope>compile</scope> + <scope>provided</scope> </dependency> <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-compress</artifactId> - <version>1.8</version> - <scope>compile</scope> - </dependency> - - <!-- other provided dependencies --> - <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_${scala.binary.version}</artifactId> <version>${spark.version}</version> @@ -109,117 +100,26 @@ <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-mixserv</artifactId> - <version>${project.version}</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.xerial</groupId> - <artifactId>xerial-core</artifactId> - <version>3.2.3</version> <scope>test</scope> </dependency> <dependency> <groupId>org.scalatest</groupId> <artifactId>scalatest_${scala.binary.version}</artifactId> - <version>2.2.4</version> <scope>test</scope> </dependency> </dependencies> <build> - <directory>target</directory> - <outputDirectory>target/classes</outputDirectory> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <testOutputDirectory>target/test-classes</testOutputDirectory> <plugins> - <!-- For incremental compilation --> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - 
<version>3.2.2</version> - <executions> - <execution> - <id>scala-compile-first</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile-first</id> - <phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - <configuration> - <scalaVersion>${scala.version}</scalaVersion> - <recompileMode>incremental</recompileMode> - <useZincServer>true</useZincServer> - <args> - <arg>-unchecked</arg> - <arg>-deprecation</arg> - <!-- TODO: To enable this option, we need to fix many wornings --> - <!-- <arg>-feature</arg> --> - </args> - <jvmArgs> - <jvmArg>-Xms1024m</jvmArg> - <jvmArg>-Xmx1024m</jvmArg> - <jvmArg>-XX:PermSize=${PermGen}</jvmArg> - <jvmArg>-XX:MaxPermSize=${MaxPermGen}</jvmArg> - <jvmArg>-XX:ReservedCodeCacheSize=${CodeCacheSize}</jvmArg> - </jvmArgs> - </configuration> - </plugin> - <!-- hivemall-spark_xx-xx.jar --> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <version>2.5</version> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - </configuration> - </plugin> <!-- hivemall-spark_xx-xx-with-dependencies.jar including minimum dependencies --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> - <version>3.1.0</version> - <executions> - <execution> - <id>jar-with-dependencies</id> - <phase>package</phase> - <goals> - <goal>shade</goal> - </goals> - <configuration> - <finalName>${project.artifactId}-${spark.binary.version}_${scala.binary.version}-${project.version}-with-dependencies</finalName> - <outputDirectory>${project.parent.build.directory}</outputDirectory> - <minimizeJar>false</minimizeJar> - <createDependencyReducedPom>false</createDependencyReducedPom> - 
<artifactSet> - <includes> - <include>org.apache.hivemall:hivemall-core</include> - <include>org.apache.hivemall:hivemall-xgboost</include> - <include>org.apache.hivemall:hivemall-spark-common</include> - <include>com.github.haifengl:smile-core</include> - <include>com.github.haifengl:smile-math</include> - <include>com.github.haifengl:smile-data</include> - <include>ml.dmlc:xgboost4j</include> - <include>com.esotericsoftware.kryo:kryo</include> - </includes> - </artifactSet> - </configuration> - </execution> - </executions> </plugin> <!-- disable surefire because there is no java test --> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> - <version>2.7</version> <configuration> <skipTests>true</skipTests> </configuration> @@ -228,33 +128,6 @@ <plugin> <groupId>org.scalatest</groupId> <artifactId>scalatest-maven-plugin</artifactId> - <version>1.0</version> - <configuration> - <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory> - <junitxml>.</junitxml> - <filereports>SparkTestSuite.txt</filereports> - <argLine>-ea -Xmx2g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize}</argLine> - <stderr /> - <environmentVariables> - <SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES> - <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION> - <SPARK_TESTING>1</SPARK_TESTING> - <JAVA_HOME>${env.JAVA_HOME}</JAVA_HOME> - </environmentVariables> - <systemProperties> - <log4j.configuration>file:src/test/resources/log4j.properties</log4j.configuration> - <derby.system.durability>test</derby.system.durability> - <java.awt.headless>true</java.awt.headless> - <java.io.tmpdir>${project.build.directory}/tmp</java.io.tmpdir> - <spark.testing>1</spark.testing> - <spark.ui.enabled>false</spark.ui.enabled> - <spark.ui.showConsoleProgress>false</spark.ui.showConsoleProgress> - <spark.unsafe.exceptionOnMemoryLeak>true</spark.unsafe.exceptionOnMemoryLeak> - <!-- Needed by sql/hive tests. 
--> - <test.src.tables>__not_used__</test.src.tables> - </systemProperties> - <tagsToExclude>${test.exclude.tags}</tagsToExclude> - </configuration> <executions> <execution> <id>test</id> @@ -264,6 +137,16 @@ </execution> </executions> </plugin> + <plugin> + <groupId>org.scalatest</groupId> + <artifactId>scalatest-maven-plugin</artifactId> + <configuration> + <environmentVariables> + <JAVA_HOME>${env.JAVA8_HOME}</JAVA_HOME> + <PATH>${env.JAVA8_HOME}/bin:${env.PATH}</PATH> + </environmentVariables> + </configuration> + </plugin> </plugins> </build> </project> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala index 00617b7..2982d9c 100644 --- a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala +++ b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala @@ -127,7 +127,7 @@ final class HivemallGroupedDataset(groupBy: RelationalGroupedDataset) { * @group ensemble */ def max_label(score: String, label: String): DataFrame = { - checkType(score, DoubleType) + // checkType(score, DoubleType) checkType(label, StringType) val udaf = HiveUDAFFunction( "max_label", http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.2/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.2/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala b/spark/spark-2.2/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala new file mode 100644 index 0000000..a6bbb4b --- /dev/null +++ 
b/spark/spark-2.2/src/main/scala/org/apache/spark/streaming/HivemallStreamingOps.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.streaming + +import scala.reflect.ClassTag + +import org.apache.spark.ml.feature.HivemallLabeledPoint +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.streaming.dstream.DStream + +final class HivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) { + + def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext) + : DStream[Row] = { + ds.transform[Row] { rdd: RDD[HivemallLabeledPoint] => + f(sqlContext.createDataFrame(rdd)).rdd + } + } +} + +object HivemallStreamingOps { + + /** + * Implicitly inject the [[HivemallStreamingOps]] into [[DStream]]. 
+ */ + implicit def dataFrameToHivemallStreamingOps(ds: DStream[HivemallLabeledPoint]) + : HivemallStreamingOps = { + new HivemallStreamingOps(ds) + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala index 1e1c574..f16eae0 100644 --- a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala +++ b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HiveUdfSuite.scala @@ -36,7 +36,7 @@ final class HiveUdfWithFeatureSuite extends HivemallFeatureQueryTest { checkAnswer( sql(s"SELECT DISTINCT hivemall_version()"), - Row("0.5.0-incubating-SNAPSHOT") + Row("0.5.1-incubating-SNAPSHOT") ) // sql("DROP TEMPORARY FUNCTION IF EXISTS hivemall_version") http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index f73cb75..f2b7b6e 100644 --- a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -562,7 +562,7 @@ class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest { } test("misc - hivemall_version") { - checkAnswer(DummyInputData.select(hivemall_version()), Row("0.5.0-incubating-SNAPSHOT")) + checkAnswer(DummyInputData.select(hivemall_version()), Row("0.5.1-incubating-SNAPSHOT")) } test("misc - rowid") { 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-common/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-common/pom.xml b/spark/spark-common/pom.xml deleted file mode 100644 index 50670d3..0000000 --- a/spark/spark-common/pom.xml +++ /dev/null @@ -1,146 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall</artifactId> - <version>0.5.0-incubating-SNAPSHOT</version> - <relativePath>../../pom.xml</relativePath> - </parent> - - <artifactId>hivemall-spark-common</artifactId> - <name>Hivemall on Spark Common</name> - <packaging>jar</packaging> - - <properties> - <main.basedir>${project.parent.basedir}</main.basedir> - </properties> - - <dependencies> - <!-- hivemall dependencies --> - <dependency> - <groupId>org.apache.hivemall</groupId> - <artifactId>hivemall-core</artifactId> - <version>${project.version}</version> - <scope>compile</scope> - </dependency> - - <!-- other provided dependencies --> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-sql_${scala.binary.version}</artifactId> - <version>${spark.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-hive_${scala.binary.version}</artifactId> - <version>${spark.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-streaming_${scala.binary.version}</artifactId> - <version>${spark.version}</version> - <scope>provided</scope> - </dependency> - - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-common</artifactId> - <version>${hadoop.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-mapreduce-client-core</artifactId> - <version>${hadoop.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hive</groupId> - <artifactId>hive-exec</artifactId> - 
<version>${hive.version}</version> - <scope>provided</scope> - </dependency> - </dependencies> - - <build> - <directory>target</directory> - <outputDirectory>target/classes</outputDirectory> - <finalName>${project.artifactId}-${project.version}</finalName> - <testOutputDirectory>target/test-classes</testOutputDirectory> - <plugins> - <!-- For resolving spark binary incompatibility --> - <plugin> - <artifactId>maven-clean-plugin</artifactId> - <version>3.0.0</version> - <executions> - <execution> - <phase>initialize</phase> - <goals> - <goal>clean</goal> - </goals> - </execution> - </executions> - </plugin> - <!-- For incremental compilation --> - <plugin> - <groupId>net.alchim31.maven</groupId> - <artifactId>scala-maven-plugin</artifactId> - <version>3.2.2</version> - <executions> - <execution> - <id>scala-compile-first</id> - <phase>process-resources</phase> - <goals> - <goal>compile</goal> - </goals> - </execution> - <execution> - <id>scala-test-compile-first</id> - <phase>process-test-resources</phase> - <goals> - <goal>testCompile</goal> - </goals> - </execution> - </executions> - <configuration> - <scalaVersion>${scala.version}</scalaVersion> - <recompileMode>incremental</recompileMode> - <useZincServer>true</useZincServer> - <args> - <arg>-unchecked</arg> - <arg>-deprecation</arg> - <!-- TODO: To enable this option, we need to fix many wornings --> - <!-- <arg>-feature</arg> --> - </args> - <jvmArgs> - <jvmArg>-Xms512m</jvmArg> - <jvmArg>-Xmx1024m</jvmArg> - </jvmArgs> - </configuration> - </plugin> - </plugins> - </build> -</project> - http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3a718713/spark/spark-common/scalastyle-config.xml ---------------------------------------------------------------------- diff --git a/spark/spark-common/scalastyle-config.xml b/spark/spark-common/scalastyle-config.xml deleted file mode 100644 index 13d1c47..0000000 --- a/spark/spark-common/scalastyle-config.xml +++ /dev/null @@ -1,333 +0,0 @@ -<!-- - Licensed to 
the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<!-- -If you wish to turn off checking for a section of code, you can put a comment in the source -before and after the section, with the following syntax: - - // scalastyle:off - ... // stuff that breaks the styles - // scalastyle:on - -You can also disable only one rule, by specifying its rule id, as specified in: - http://www.scalastyle.org/rules-0.7.0.html - - // scalastyle:off no.finalize - override def finalize(): Unit = ... - // scalastyle:on no.finalize - -This file is divided into 3 sections: - (1) rules that we enforce. - (2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet - (or we need to make the scalastyle rule more configurable). - (3) rules that we don't want to enforce. 
---> - -<scalastyle> - <name>Scalastyle standard configuration</name> - - <!-- ================================================================================ --> - <!-- rules we enforce --> - <!-- ================================================================================ --> - - <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true"> - <parameters> - <parameter name="header"><![CDATA[/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */]]></parameter> - </parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true"> - <parameters> - <parameter name="maxLineLength"><![CDATA[100]]></parameter> - <parameter name="tabSize"><![CDATA[2]]></parameter> - <parameter name="ignoreImports">true</parameter> - </parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true"> - <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true"> - <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true"> - <parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true"> - <parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true"> - <parameters> - <parameter 
name="singleLineAllowed"><![CDATA[true]]></parameter> - <parameter name="doubleLineAllowed"><![CDATA[true]]></parameter> - </parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check> - - <check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check> - - <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true"> - <parameters> - <parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter> - </parameters> - </check> - - <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true"> - <parameters> - <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter> - </parameters> - </check> - - <!-- ??? usually shouldn't be checked into the code base. --> - <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check> - - <!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' --> - <check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> - <parameters><parameter name="regex">^println$</parameter></parameters> - <customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with - // scalastyle:off println - println(...) 
- // scalastyle:on println]]></customMessage> - </check> - - <check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">@VisibleForTesting</parameter></parameters> - <customMessage><![CDATA[ - @VisibleForTesting causes classpath issues. Please note this in the java doc instead (SPARK-11615). - ]]></customMessage> - </check> - - <check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters> - <customMessage><![CDATA[ - Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use - ShutdownHookManager.addShutdownHook instead. - If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with - // scalastyle:off runtimeaddshutdownhook - Runtime.getRuntime.addShutdownHook(...) - // scalastyle:on runtimeaddshutdownhook - ]]></customMessage> - </check> - - <check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters> - <customMessage><![CDATA[ - Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use - java.util.concurrent.ConcurrentLinkedQueue instead. - If you must use mutable.SynchronizedBuffer, wrap the code block with - // scalastyle:off mutablesynchronizedbuffer - mutable.SynchronizedBuffer[...] - // scalastyle:on mutablesynchronizedbuffer - ]]></customMessage> - </check> - - <check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">Class\.forName</parameter></parameters> - <customMessage><![CDATA[ - Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead. 
- If you must use Class.forName, wrap the code block with - // scalastyle:off classforname - Class.forName(...) - // scalastyle:on classforname - ]]></customMessage> - </check> - - <check customId="awaitresult" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">Await\.result</parameter></parameters> - <customMessage><![CDATA[ - Are you sure that you want to use Await.result? In most cases, you should use ThreadUtils.awaitResult instead. - If you must use Await.result, wrap the code block with - // scalastyle:off awaitresult - Await.result(...) - // scalastyle:on awaitresult - ]]></customMessage> - </check> - - <!-- As of SPARK-9613 JavaConversions should be replaced with JavaConverters --> - <check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> - <parameters><parameter name="regex">JavaConversions</parameter></parameters> - <customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import - scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage> - </check> - - <check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> - <parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters> - <customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead - of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage> - </check> - - <check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true"> - <parameters> - <parameter name="groups">java,scala,3rdParty,spark</parameter> - <parameter name="group.java">javax?\..*</parameter> - <parameter name="group.scala">scala\..*</parameter> - <parameter name="group.3rdParty">(?!org\.apache\.spark\.).*</parameter> - <parameter name="group.spark">org\.apache\.spark\..*</parameter> - </parameters> - </check> - - <check level="error" 
class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true"> - <parameters> - <parameter name="tokens">COMMA</parameter> - </parameters> - </check> - - <!-- SPARK-3854: Single Space between ')' and '{' --> - <check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">\)\{</parameter></parameters> - <customMessage><![CDATA[ - Single Space between ')' and `{`. - ]]></customMessage> - </check> - - <check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]</parameter></parameters> - <customMessage>Use Javadoc style indentation for multiline comments</customMessage> - </check> - - <check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> - <parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters> - <customMessage>Omit braces in case clauses.</customMessage> - </check> - - <!-- SPARK-16877: Avoid Java annotations --> - <check customId="OverrideJavaCase" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true"> - <parameters><parameter name="regex">^Override$</parameter></parameters> - <customMessage>override modifier should be used instead of @java.lang.Override.</customMessage> - </check> - - <check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"></check> - - <!-- ================================================================================ --> - <!-- rules we'd like to enforce, but haven't cleaned up the codebase yet --> - <!-- ================================================================================ --> - - <!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. 
--> - <!-- Ideally the following two rules should be configurable to rule out string interpolation. --> - <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check> - <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check> - - <!-- This breaks symbolic method names so we don't turn it on. --> - <!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. --> - <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false"> - <parameters> - <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter> - </parameters> - </check> - - <!-- Should turn this on, but we have a few places that need to be fixed first --> - <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check> - - <!-- ================================================================================ --> - <!-- rules we don't want --> - <!-- ================================================================================ --> - - <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false"> - <parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters> - </check> - - <!-- We want the opposite of this: NewLineAtEofChecker --> - <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check> - - <!-- This one complains about all kinds of random things. Disable. 
--> - <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check> - - <!-- We use return quite a bit for control flows and guards --> - <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check> - - <!-- We use null a lot in low level code and to interface with 3rd party code --> - <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false"> - <parameters><parameter name="maxFileLength">800></parameter></parameters> - </check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false"> - <parameters><parameter name="maxTypes">30</parameter></parameters> - </check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false"> - <parameters><parameter name="maximum">10</parameter></parameters> - </check> - - <!-- Doesn't seem super big deal here ... --> - <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false"> - <parameters><parameter name="maxLength">50</parameter></parameters> - </check> - - <!-- Not exactly feasible to enforce this right now. --> - <!-- It is also infrequent that somebody introduces a new class with a lot of methods. --> - <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false"> - <parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters> - </check> - - <!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... 
--> - <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false"> - <parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters> - </check> - -</scalastyle>