spark git commit: [SPARK-15220][UI] add hyperlink to running application and completed application
Repository: spark Updated Branches: refs/heads/master ee6a8d7ea -> f8aca5b4a [SAPRK-15220][UI] add hyperlink to running application and completed application ## What changes were proposed in this pull request? Add hyperlink to "running application" and "completed application", so user can jump to application table directly, In my environment, I set up 1000+ works and it's painful to scroll down to skip worker list. ## How was this patch tested? manual tested (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) ![sceenshot](https://cloud.githubusercontent.com/assets/13216322/15105718/97e06768-15f6-11e6-809d-3574046751a9.png) Author: mwwsCloses #12997 from mwws/SPARK_UI. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f8aca5b4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f8aca5b4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f8aca5b4 Branch: refs/heads/master Commit: f8aca5b4a98ee16c296aa7850925fdc756813b87 Parents: ee6a8d7 Author: mwws Authored: Mon May 9 11:17:14 2016 -0700 Committer: Andrew Or Committed: Mon May 9 11:17:14 2016 -0700 -- .../scala/org/apache/spark/deploy/master/ui/MasterPage.scala | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f8aca5b4/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index 75de3ed..5ed3e39 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -114,8 +114,8 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { {Utils.megabytesToString(aliveWorkers.map(_.memory).sum)} Total, {Utils.megabytesToString(aliveWorkers.map(_.memoryUsed).sum)} Used 
Applications: -{state.activeApps.length} Running, -{state.completedApps.length} Completed +{state.activeApps.length} Running, +{state.completedApps.length} Completed Drivers: {state.activeDrivers.length} Running, {state.completedDrivers.length} Completed @@ -133,7 +133,7 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { - Running Applications + Running Applications {activeAppsTable} @@ -152,7 +152,7 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { - Completed Applications + Completed Applications {completedAppsTable} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15210][SQL] Add missing @DeveloperApi annotation in sql.types
Repository: spark Updated Branches: refs/heads/master f8aca5b4a -> dfdcab00c [SPARK-15210][SQL] Add missing @DeveloperApi annotation in sql.types add DeveloperApi annotation for `AbstractDataType` `MapType` `UserDefinedType` local build Author: Zheng RuiFengCloses #12982 from zhengruifeng/types_devapi. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dfdcab00 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dfdcab00 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dfdcab00 Branch: refs/heads/master Commit: dfdcab00c7b6200c22883baa3ebc5818be09556f Parents: f8aca5b Author: Zheng RuiFeng Authored: Mon May 9 11:20:48 2016 -0700 Committer: Andrew Or Committed: Mon May 9 11:21:16 2016 -0700 -- .../main/scala/org/apache/spark/sql/types/AbstractDataType.scala | 2 ++ .../src/main/scala/org/apache/spark/sql/types/MapType.scala | 2 ++ .../main/scala/org/apache/spark/sql/types/UserDefinedType.scala | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dfdcab00/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala index 90af10f..03ea349 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.types import scala.reflect.ClassTag import scala.reflect.runtime.universe.{runtimeMirror, TypeTag} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.ScalaReflectionLock import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.util.Utils @@ -141,6 +142,7 @@ protected[sql] abstract class AtomicType extends DataType { * :: DeveloperApi :: 
* Numeric data types. */ +@DeveloperApi abstract class NumericType extends AtomicType { // Unfortunately we can't get this implicitly as that breaks Spark Serialization. In order for // implicitly[Numeric[JvmType]] to be valid, we have to change JvmType from a type variable to a http://git-wip-us.apache.org/repos/asf/spark/blob/dfdcab00/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala index 5474954..454ea40 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.types import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ +import org.apache.spark.annotation.DeveloperApi /** * :: DeveloperApi :: @@ -31,6 +32,7 @@ import org.json4s.JsonDSL._ * @param valueType The data type of map values. * @param valueContainsNull Indicates if map values have `null` values. */ +@DeveloperApi case class MapType( keyType: DataType, valueType: DataType, http://git-wip-us.apache.org/repos/asf/spark/blob/dfdcab00/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala index aa36121..8946313 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala @@ -96,11 +96,12 @@ abstract class UserDefinedType[UserType >: Null] extends DataType with Serializa } /** - * ::DeveloperApi:: + * :: DeveloperApi :: * The user defined type in Python. * * Note: This can only be accessed via Python UDF, or accessed as serialized object. 
*/ +@DeveloperApi private[sql] class PythonUserDefinedType( val sqlType: DataType, override val pyUDT: String, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15210][SQL] Add missing @DeveloperApi annotation in sql.types
Repository: spark Updated Branches: refs/heads/branch-2.0 c6d23b660 -> f81d25139 [SPARK-15210][SQL] Add missing @DeveloperApi annotation in sql.types add DeveloperApi annotation for `AbstractDataType` `MapType` `UserDefinedType` local build Author: Zheng RuiFengCloses #12982 from zhengruifeng/types_devapi. (cherry picked from commit dfdcab00c7b6200c22883baa3ebc5818be09556f) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f81d2513 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f81d2513 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f81d2513 Branch: refs/heads/branch-2.0 Commit: f81d251393e2263411df8cf8a785b81f1f3d8b56 Parents: c6d23b6 Author: Zheng RuiFeng Authored: Mon May 9 11:20:48 2016 -0700 Committer: Andrew Or Committed: Mon May 9 11:21:32 2016 -0700 -- .../main/scala/org/apache/spark/sql/types/AbstractDataType.scala | 2 ++ .../src/main/scala/org/apache/spark/sql/types/MapType.scala | 2 ++ .../main/scala/org/apache/spark/sql/types/UserDefinedType.scala | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f81d2513/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala index 90af10f..03ea349 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.types import scala.reflect.ClassTag import scala.reflect.runtime.universe.{runtimeMirror, TypeTag} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.ScalaReflectionLock import org.apache.spark.sql.catalyst.expressions.Expression import 
org.apache.spark.util.Utils @@ -141,6 +142,7 @@ protected[sql] abstract class AtomicType extends DataType { * :: DeveloperApi :: * Numeric data types. */ +@DeveloperApi abstract class NumericType extends AtomicType { // Unfortunately we can't get this implicitly as that breaks Spark Serialization. In order for // implicitly[Numeric[JvmType]] to be valid, we have to change JvmType from a type variable to a http://git-wip-us.apache.org/repos/asf/spark/blob/f81d2513/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala index 5474954..454ea40 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.types import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ +import org.apache.spark.annotation.DeveloperApi /** * :: DeveloperApi :: @@ -31,6 +32,7 @@ import org.json4s.JsonDSL._ * @param valueType The data type of map values. * @param valueContainsNull Indicates if map values have `null` values. */ +@DeveloperApi case class MapType( keyType: DataType, valueType: DataType, http://git-wip-us.apache.org/repos/asf/spark/blob/f81d2513/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala index aa36121..8946313 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala @@ -96,11 +96,12 @@ abstract class UserDefinedType[UserType >: Null] extends DataType with Serializa } /** - * ::DeveloperApi:: + * :: DeveloperApi :: * The user defined type in Python. 
* * Note: This can only be accessed via Python UDF, or accessed as serialized object. */ +@DeveloperApi private[sql] class PythonUserDefinedType( val sqlType: DataType, override val pyUDT: String, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15166][SQL] Move some hive-specific code from SparkSession
Repository: spark Updated Branches: refs/heads/master dfdcab00c -> 7bf9b1201 [SPARK-15166][SQL] Move some hive-specific code from SparkSession ## What changes were proposed in this pull request? This also simplifies the code being moved. ## How was this patch tested? Existing tests. Author: Andrew Or <and...@databricks.com> Closes #12941 from andrewor14/move-code. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7bf9b120 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7bf9b120 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7bf9b120 Branch: refs/heads/master Commit: 7bf9b12019bb20470b726a7233d60ce38a9c52cc Parents: dfdcab0 Author: Andrew Or <and...@databricks.com> Authored: Mon May 9 11:24:58 2016 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Mon May 9 11:24:58 2016 -0700 -- .../scala/org/apache/spark/sql/SparkSession.scala | 13 - .../apache/spark/sql/hive/HiveSessionState.scala | 2 -- .../apache/spark/sql/hive/HiveSharedState.scala| 17 + 3 files changed, 13 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7bf9b120/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 2a893c6..c7fa8f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -73,19 +73,6 @@ class SparkSession private( | Session-related state | * --- */ - { -val defaultWarehousePath = - SQLConf.WAREHOUSE_PATH -.defaultValueString -.replace("${system:user.dir}", System.getProperty("user.dir")) -val warehousePath = sparkContext.conf.get( - SQLConf.WAREHOUSE_PATH.key, - defaultWarehousePath) -sparkContext.conf.set(SQLConf.WAREHOUSE_PATH.key, warehousePath) -sparkContext.conf.set("hive.metastore.warehouse.dir", 
warehousePath) -logInfo(s"Setting warehouse location to $warehousePath") - } - /** * State shared across sessions, including the [[SparkContext]], cached data, listener, * and a catalog that interacts with external systems. http://git-wip-us.apache.org/repos/asf/spark/blob/7bf9b120/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala index 31f28f2..46579ec 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.hive -import org.apache.hadoop.hive.conf.HiveConf.ConfVars - import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.Analyzer import org.apache.spark.sql.execution.SparkPlanner http://git-wip-us.apache.org/repos/asf/spark/blob/7bf9b120/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala index 0ea5ce9..f0d9640 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala @@ -18,8 +18,9 @@ package org.apache.spark.sql.hive import org.apache.spark.SparkContext -import org.apache.spark.sql.hive.client.{HiveClient, HiveClientImpl} -import org.apache.spark.sql.internal.SharedState +import org.apache.spark.internal.Logging +import org.apache.spark.sql.hive.client.HiveClient +import org.apache.spark.sql.internal.{SharedState, SQLConf} /** @@ -27,9 +28,17 @@ import org.apache.spark.sql.internal.SharedState * [[org.apache.spark.sql.SparkSession]] backed by Hive. 
*/ private[hive] class HiveSharedState(override val sparkContext: SparkContext) - extends SharedState(sparkContext) { + extends SharedState(sparkContext) with Logging { - // TODO: just share the IsolatedClientLoader instead of the client instances themselves + // TODO: just share the IsolatedClientLoader instead of the client instance itself + + { +// Set the Hive metastore warehouse path to the one we use +val tempConf = new SQLConf +sparkContext.conf
spark git commit: [SPARK-10653][CORE] Remove unnecessary things from SparkEnv
Repository: spark Updated Branches: refs/heads/branch-2.0 e3f000a36 -> 40d24686a [SPARK-10653][CORE] Remove unnecessary things from SparkEnv ## What changes were proposed in this pull request? Removed blockTransferService and sparkFilesDir from SparkEnv since they're rarely used and don't need to be in stored in the env. Edited their few usages to accommodate the change. ## How was this patch tested? ran dev/run-tests locally Author: Alex BozarthCloses #12970 from ajbozarth/spark10653. (cherry picked from commit c3e23bc0c3e87546d0575c3c4c45a2b0e2dfec6a) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/40d24686 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/40d24686 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/40d24686 Branch: refs/heads/branch-2.0 Commit: 40d24686aecc6b655b497b01303d0fdd4d4d480f Parents: e3f000a Author: Alex Bozarth Authored: Mon May 9 11:51:37 2016 -0700 Committer: Andrew Or Committed: Mon May 9 11:51:47 2016 -0700 -- .../main/scala/org/apache/spark/SparkEnv.scala | 26 .../scala/org/apache/spark/SparkFiles.scala | 2 +- .../org/apache/spark/storage/BlockManager.scala | 2 +- .../org/apache/spark/DistributedSuite.scala | 2 +- project/MimaExcludes.scala | 4 +++ 5 files changed, 12 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/40d24686/core/src/main/scala/org/apache/spark/SparkEnv.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 4bf8890..af50a6d 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -31,7 +31,6 @@ import org.apache.spark.broadcast.BroadcastManager import org.apache.spark.internal.Logging import org.apache.spark.memory.{MemoryManager, StaticMemoryManager, UnifiedMemoryManager} import org.apache.spark.metrics.MetricsSystem -import 
org.apache.spark.network.BlockTransferService import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.rpc.{RpcEndpoint, RpcEndpointRef, RpcEnv} import org.apache.spark.scheduler.{LiveListenerBus, OutputCommitCoordinator} @@ -61,10 +60,8 @@ class SparkEnv ( val mapOutputTracker: MapOutputTracker, val shuffleManager: ShuffleManager, val broadcastManager: BroadcastManager, -val blockTransferService: BlockTransferService, val blockManager: BlockManager, val securityManager: SecurityManager, -val sparkFilesDir: String, val metricsSystem: MetricsSystem, val memoryManager: MemoryManager, val outputCommitCoordinator: OutputCommitCoordinator, @@ -77,7 +74,7 @@ class SparkEnv ( // (e.g., HadoopFileRDD uses this to cache JobConfs and InputFormats). private[spark] val hadoopJobMetadata = new MapMaker().softValues().makeMap[String, Any]() - private var driverTmpDirToDelete: Option[String] = None + private[spark] var driverTmpDir: Option[String] = None private[spark] def stop() { @@ -94,13 +91,10 @@ class SparkEnv ( rpcEnv.shutdown() rpcEnv.awaitTermination() - // Note that blockTransferService is stopped by BlockManager since it is started by it. - // If we only stop sc, but the driver process still run as a services then we need to delete // the tmp dir, if not, it will create too many tmp dirs. - // We only need to delete the tmp dir create by driver, because sparkFilesDir is point to the - // current working dir in executor which we do not need to delete. - driverTmpDirToDelete match { + // We only need to delete the tmp dir create by driver + driverTmpDir match { case Some(path) => try { Utils.deleteRecursively(new File(path)) @@ -342,15 +336,6 @@ object SparkEnv extends Logging { ms } -// Set the sparkFiles directory, used when downloading dependencies. In local mode, -// this is a temporary directory; in distributed mode, this is the executor's current working -// directory. 
-val sparkFilesDir: String = if (isDriver) { - Utils.createTempDir(Utils.getLocalDir(conf), "userFiles").getAbsolutePath -} else { - "." -} - val outputCommitCoordinator = mockOutputCommitCoordinator.getOrElse { new OutputCommitCoordinator(conf, isDriver) } @@ -367,10 +352,8 @@ object SparkEnv extends Logging { mapOutputTracker, shuffleManager, broadcastManager, - blockTransferService, blockManager,
spark git commit: [HOTFIX] SQL test compilation error from merge conflict
Repository: spark Updated Branches: refs/heads/master 5c6b08557 -> cddb9da07 [HOTFIX] SQL test compilation error from merge conflict Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cddb9da0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cddb9da0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cddb9da0 Branch: refs/heads/master Commit: cddb9da074b3dfeef34b7fdb8d9a8b16513a819d Parents: 5c6b085 Author: Andrew OrAuthored: Tue May 10 11:46:02 2016 -0700 Committer: Andrew Or Committed: Tue May 10 11:46:02 2016 -0700 -- .../scala/org/apache/spark/sql/internal/CatalogSuite.scala | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cddb9da0/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala index 56f848b..d8a2c38 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala @@ -279,10 +279,10 @@ class CatalogSuite assert(tableFields == Seq("nama", "databasa", "descripta", "typa", false)) assert(functionFields == Seq("nama", "descripta", "classa", false)) assert(columnFields == Seq("nama", "descripta", "typa", false, true, true)) -val dbString = CatalogImpl.makeDataset(Seq(db), sparkSession).showString(10) -val tableString = CatalogImpl.makeDataset(Seq(table), sparkSession).showString(10) -val functionString = CatalogImpl.makeDataset(Seq(function), sparkSession).showString(10) -val columnString = CatalogImpl.makeDataset(Seq(column), sparkSession).showString(10) +val dbString = CatalogImpl.makeDataset(Seq(db), spark).showString(10) +val tableString = CatalogImpl.makeDataset(Seq(table), spark).showString(10) +val functionString = 
CatalogImpl.makeDataset(Seq(function), spark).showString(10) +val columnString = CatalogImpl.makeDataset(Seq(column), spark).showString(10) dbFields.foreach { f => assert(dbString.contains(f.toString)) } tableFields.foreach { f => assert(tableString.contains(f.toString)) } functionFields.foreach { f => assert(functionString.contains(f.toString)) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15037][HOTFIX] Don't create 2 SparkSessions in constructor
Repository: spark Updated Branches: refs/heads/master db3b4a201 -> 69641066a [SPARK-15037][HOTFIX] Don't create 2 SparkSessions in constructor ## What changes were proposed in this pull request? After #12907 `TestSparkSession` creates a spark session in one of the constructors just to get the `SparkContext` from it. This ends up creating 2 `SparkSession`s from one call, which is definitely not what we want. ## How was this patch tested? Jenkins. Author: Andrew Or <and...@databricks.com> Closes #13031 from andrewor14/sql-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69641066 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69641066 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69641066 Branch: refs/heads/master Commit: 69641066ae1d35c33b082451cef636a7f2e646d9 Parents: db3b4a2 Author: Andrew Or <and...@databricks.com> Authored: Tue May 10 12:07:47 2016 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Tue May 10 12:07:47 2016 -0700 -- .../org/apache/spark/sql/test/TestSQLContext.scala | 12 +--- 1 file changed, 1 insertion(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69641066/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala index 785e345..2f247ca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -31,17 +31,7 @@ private[sql] class TestSparkSession(sc: SparkContext) extends SparkSession(sc) { } def this() { -this { - val conf = new SparkConf() - conf.set("spark.sql.testkey", "true") - - val spark = SparkSession.builder -.master("local[2]") -.appName("test-sql-context") -.config(conf) -.getOrCreate() - spark.sparkContext -} +this(new SparkConf) } 
@transient - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14603][SQL] Verification of Metadata Operations by Session Catalog
Repository: spark Updated Branches: refs/heads/master ed0b4070f -> 5c6b08557 [SPARK-14603][SQL] Verification of Metadata Operations by Session Catalog Since we cannot really trust if the underlying external catalog can throw exceptions when there is an invalid metadata operation, let's do it in SessionCatalog. - [X] The first step is to unify the error messages issued in Hive-specific Session Catalog and general Session Catalog. - [X] The second step is to verify the inputs of metadata operations for partitioning-related operations. This is moved to a separate PR: https://github.com/apache/spark/pull/12801 - [X] The third step is to add database existence verification in `SessionCatalog` - [X] The fourth step is to add table existence verification in `SessionCatalog` - [X] The fifth step is to add function existence verification in `SessionCatalog` Add test cases and verify the error messages we issued Author: gatorsmileAuthor: xiaoli Author: Xiao Li Closes #12385 from gatorsmile/verifySessionAPIs. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c6b0855 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c6b0855 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c6b0855 Branch: refs/heads/master Commit: 5c6b0855787c080d3e233eb09c05c025395e7cb3 Parents: ed0b407 Author: gatorsmile Authored: Tue May 10 11:25:39 2016 -0700 Committer: Andrew Or Committed: Tue May 10 11:25:55 2016 -0700 -- python/pyspark/sql/utils.py | 2 + .../analysis/AlreadyExistException.scala| 49 ++ .../catalyst/analysis/NoSuchItemException.scala | 8 + .../sql/catalyst/catalog/ExternalCatalog.scala | 6 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 37 ++--- .../sql/catalyst/catalog/SessionCatalog.scala | 84 +-- .../catalyst/catalog/SessionCatalogSuite.scala | 148 --- .../spark/sql/execution/command/DDLSuite.scala | 14 +- .../sql/hive/execution/HiveCommandSuite.scala | 6 +- .../sql/hive/execution/SQLQuerySuite.scala | 30 ++-- 10 files changed, 261 insertions(+), 123 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c6b0855/python/pyspark/sql/utils.py -- diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index cb172d2..36c9322 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -61,6 +61,8 @@ def capture_sql_exception(f): e.java_exception.getStackTrace())) if s.startswith('org.apache.spark.sql.AnalysisException: '): raise AnalysisException(s.split(': ', 1)[1], stackTrace) +if s.startswith('org.apache.spark.sql.catalyst.analysis.NoSuchTableException: '): +raise AnalysisException(s.split(': ', 1)[1], stackTrace) if s.startswith('org.apache.spark.sql.catalyst.parser.ParseException: '): raise ParseException(s.split(': ', 1)[1], stackTrace) if s.startswith('org.apache.spark.sql.ContinuousQueryException: '): 
http://git-wip-us.apache.org/repos/asf/spark/blob/5c6b0855/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala new file mode 100644 index 000..ec56fe7 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.AnalysisException +import
spark git commit: [SPARK-15037][HOTFIX] Replace `sqlContext` and `sparkSession` with `spark`.
Repository: spark Updated Branches: refs/heads/master cddb9da07 -> db3b4a201 [SPARK-15037][HOTFIX] Replace `sqlContext` and `sparkSession` with `spark`. This replaces `sparkSession` with `spark` in CatalogSuite.scala. Pass the Jenkins tests. Author: Dongjoon HyunCloses #13030 from dongjoon-hyun/hotfix_sparkSession. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db3b4a20 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db3b4a20 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db3b4a20 Branch: refs/heads/master Commit: db3b4a20150ff7fb1caaf62ab3d2a2f1e632af36 Parents: cddb9da Author: Dongjoon Hyun Authored: Tue May 10 11:53:41 2016 -0700 Committer: Andrew Or Committed: Tue May 10 11:53:44 2016 -0700 -- .../scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db3b4a20/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 6dcc404..8b60802 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -536,7 +536,7 @@ class HiveDDLSuite withTable("t1") { withTempPath { dir => val path = dir.getCanonicalPath -sqlContext.range(1).write.parquet(path) +spark.range(1).write.parquet(path) sql(s"CREATE TABLE t1 USING parquet OPTIONS (PATH '$path')") val desc = sql("DESC FORMATTED t1").collect().toSeq @@ -548,7 +548,7 @@ class HiveDDLSuite test("desc table for data source table - partitioned bucketed table") { withTable("t1") { - sqlContext + spark .range(1).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd).write .bucketBy(2, "b").sortBy("c").partitionBy("d") 
.saveAsTable("t1") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15037][HOTFIX] Replace `sqlContext` and `sparkSession` with `spark`.
Repository: spark Updated Branches: refs/heads/branch-2.0 42db140c5 -> bd7fd14c9 [SPARK-15037][HOTFIX] Replace `sqlContext` and `sparkSession` with `spark`. This replaces `sparkSession` with `spark` in CatalogSuite.scala. Pass the Jenkins tests. Author: Dongjoon HyunCloses #13030 from dongjoon-hyun/hotfix_sparkSession. (cherry picked from commit db3b4a20150ff7fb1caaf62ab3d2a2f1e632af36) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bd7fd14c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bd7fd14c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bd7fd14c Branch: refs/heads/branch-2.0 Commit: bd7fd14c93746556bd99faa640fb6b95defef148 Parents: 42db140 Author: Dongjoon Hyun Authored: Tue May 10 11:53:41 2016 -0700 Committer: Andrew Or Committed: Tue May 10 11:53:59 2016 -0700 -- .../scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bd7fd14c/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 6dcc404..8b60802 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -536,7 +536,7 @@ class HiveDDLSuite withTable("t1") { withTempPath { dir => val path = dir.getCanonicalPath -sqlContext.range(1).write.parquet(path) +spark.range(1).write.parquet(path) sql(s"CREATE TABLE t1 USING parquet OPTIONS (PATH '$path')") val desc = sql("DESC FORMATTED t1").collect().toSeq @@ -548,7 +548,7 @@ class HiveDDLSuite test("desc table for data source table - partitioned bucketed table") { withTable("t1") { - sqlContext + spark 
.range(1).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd).write .bucketBy(2, "b").sortBy("c").partitionBy("d") .saveAsTable("t1") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14603][SQL] Verification of Metadata Operations by Session Catalog
Repository: spark Updated Branches: refs/heads/branch-2.0 5bf74b44d -> 42db140c5 [SPARK-14603][SQL] Verification of Metadata Operations by Session Catalog Since we cannot really trust if the underlying external catalog can throw exceptions when there is an invalid metadata operation, let's do it in SessionCatalog. - [X] The first step is to unify the error messages issued in Hive-specific Session Catalog and general Session Catalog. - [X] The second step is to verify the inputs of metadata operations for partitioning-related operations. This is moved to a separate PR: https://github.com/apache/spark/pull/12801 - [X] The third step is to add database existence verification in `SessionCatalog` - [X] The fourth step is to add table existence verification in `SessionCatalog` - [X] The fifth step is to add function existence verification in `SessionCatalog` Add test cases and verify the error messages we issued Author: gatorsmileAuthor: xiaoli Author: Xiao Li Closes #12385 from gatorsmile/verifySessionAPIs. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/42db140c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/42db140c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/42db140c Branch: refs/heads/branch-2.0 Commit: 42db140c5e134fe442d5160836f576f202aa17e5 Parents: 5bf74b4 Author: gatorsmile Authored: Tue May 10 11:25:39 2016 -0700 Committer: Andrew Or Committed: Tue May 10 11:28:04 2016 -0700 -- python/pyspark/sql/utils.py | 2 + .../analysis/AlreadyExistException.scala| 49 ++ .../catalyst/analysis/NoSuchItemException.scala | 8 + .../sql/catalyst/catalog/ExternalCatalog.scala | 6 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 37 ++--- .../sql/catalyst/catalog/SessionCatalog.scala | 84 +-- .../catalyst/catalog/SessionCatalogSuite.scala | 148 --- .../spark/sql/execution/command/DDLSuite.scala | 14 +- .../sql/hive/execution/HiveCommandSuite.scala | 6 +- .../sql/hive/execution/SQLQuerySuite.scala | 30 ++-- 10 files changed, 261 insertions(+), 123 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/42db140c/python/pyspark/sql/utils.py -- diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index cb172d2..36c9322 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -61,6 +61,8 @@ def capture_sql_exception(f): e.java_exception.getStackTrace())) if s.startswith('org.apache.spark.sql.AnalysisException: '): raise AnalysisException(s.split(': ', 1)[1], stackTrace) +if s.startswith('org.apache.spark.sql.catalyst.analysis.NoSuchTableException: '): +raise AnalysisException(s.split(': ', 1)[1], stackTrace) if s.startswith('org.apache.spark.sql.catalyst.parser.ParseException: '): raise ParseException(s.split(': ', 1)[1], stackTrace) if s.startswith('org.apache.spark.sql.ContinuousQueryException: '): 
http://git-wip-us.apache.org/repos/asf/spark/blob/42db140c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala new file mode 100644 index 000..ec56fe7 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.AnalysisException
spark git commit: [SPARK-15249][SQL] Use FunctionResource instead of (String, String) in CreateFunction and CatalogFunction for resource
Repository: spark Updated Branches: refs/heads/master 9533f5390 -> da02d006b [SPARK-15249][SQL] Use FunctionResource instead of (String, String) in CreateFunction and CatalogFunction for resource Use FunctionResource instead of (String, String) in CreateFunction and CatalogFunction for resource see: TODO's here https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala#L36 https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala#L42 Existing tests Author: Sandeep SinghCloses #13024 from techaddict/SPARK-15249. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/da02d006 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/da02d006 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/da02d006 Branch: refs/heads/master Commit: da02d006bbb5c4fe62abd5542b9fff7d1c58603c Parents: 9533f53 Author: Sandeep Singh Authored: Tue May 10 14:21:47 2016 -0700 Committer: Andrew Or Committed: Tue May 10 14:22:03 2016 -0700 -- .../apache/spark/sql/catalyst/catalog/SessionCatalog.scala | 8 ++-- .../spark/sql/catalyst/catalog/functionResources.scala | 8 .../org/apache/spark/sql/catalyst/catalog/interface.scala | 3 +-- .../spark/sql/catalyst/catalog/ExternalCatalogSuite.scala | 4 ++-- .../spark/sql/catalyst/catalog/SessionCatalogSuite.scala| 2 +- .../org/apache/spark/sql/execution/SparkSqlParser.scala | 4 ++-- .../org/apache/spark/sql/execution/command/functions.scala | 5 ++--- .../spark/sql/execution/command/DDLCommandSuite.scala | 9 +++-- .../org/apache/spark/sql/hive/client/HiveClientImpl.scala | 7 --- .../org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala| 4 ++-- 10 files changed, 27 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/da02d006/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 7505e2c..f53311c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -687,12 +687,8 @@ class SessionCatalog( * Loads resources such as JARs and Files for a function. Every resource is represented * by a tuple (resource type, resource uri). */ - def loadFunctionResources(resources: Seq[(String, String)]): Unit = { -resources.foreach { case (resourceType, uri) => - val functionResource = - FunctionResource(FunctionResourceType.fromString(resourceType.toLowerCase), uri) - functionResourceLoader.loadResource(functionResource) -} + def loadFunctionResources(resources: Seq[FunctionResource]): Unit = { +resources.foreach(functionResourceLoader.loadResource) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/da02d006/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/functionResources.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/functionResources.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/functionResources.scala index 5adcc89..7da1fe9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/functionResources.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/functionResources.scala @@ -20,16 +20,16 @@ package org.apache.spark.sql.catalyst.catalog import org.apache.spark.sql.AnalysisException /** A trait that represents the type of a resource needed by a function. 
*/ -sealed trait FunctionResourceType +abstract class FunctionResourceType(val resourceType: String) -object JarResource extends FunctionResourceType +object JarResource extends FunctionResourceType("jar") -object FileResource extends FunctionResourceType +object FileResource extends FunctionResourceType("file") // We do not allow users to specify an archive because it is YARN specific. // When loading resources, we will throw an exception and ask users to // use --archive with spark submit. -object ArchiveResource extends FunctionResourceType +object ArchiveResource extends FunctionResourceType("archive") object FunctionResourceType { def fromString(resourceType: String): FunctionResourceType = {
[2/3] spark git commit: [SPARK-13522][CORE] Executor should kill itself when it's unable to heartbeat to driver more than N times
[SPARK-13522][CORE] Executor should kill itself when it's unable to heartbeat to driver more than N times ## What changes were proposed in this pull request? Sometimes, network disconnection event won't be triggered for other potential race conditions that we may not have thought of, then the executor will keep sending heartbeats to driver and won't exit. This PR adds a new configuration `spark.executor.heartbeat.maxFailures` to kill Executor when it's unable to heartbeat to the driver more than `spark.executor.heartbeat.maxFailures` times. ## How was this patch tested? unit tests Author: Shixiong ZhuCloses #11401 from zsxwing/SPARK-13522. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/86bf93e6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/86bf93e6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/86bf93e6 Branch: refs/heads/branch-1.6 Commit: 86bf93e65481b8fe5d7532ca6d4cd29cafc9e9dd Parents: c433c0a Author: Shixiong Zhu Authored: Mon Feb 29 11:02:45 2016 -0800 Committer: Andrew Or Committed: Wed May 11 11:29:06 2016 -0700 -- .../org/apache/spark/executor/Executor.scala| 22 +++- .../spark/executor/ExecutorExitCode.scala | 8 +++ 2 files changed, 29 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/86bf93e6/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index b248e12..b8a1668 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -114,6 +114,19 @@ private[spark] class Executor( private val heartbeatReceiverRef = RpcUtils.makeDriverRef(HeartbeatReceiver.ENDPOINT_NAME, conf, env.rpcEnv) + /** + * When an executor is unable to send heartbeats to the driver more than `HEARTBEAT_MAX_FAILURES` + * times, it should 
kill itself. The default value is 60. It means we will retry to send + heartbeats for about 10 minutes because the heartbeat interval is 10s. + */ + private val HEARTBEAT_MAX_FAILURES = conf.getInt("spark.executor.heartbeat.maxFailures", 60) + + /** + * Count the failure times of heartbeat. It should only be accessed in the heartbeat thread. Each + * successful heartbeat will reset it to 0. + */ + private var heartbeatFailures = 0 + startDriverHeartbeater() def launchTask( @@ -464,8 +477,15 @@ private[spark] class Executor( logInfo("Told to re-register on heartbeat") env.blockManager.reregister() } + heartbeatFailures = 0 } catch { - case NonFatal(e) => logWarning("Issue communicating with driver in heartbeater", e) + case NonFatal(e) => +logWarning("Issue communicating with driver in heartbeater", e) +logError(s"Unable to send heartbeats to driver more than $HEARTBEAT_MAX_FAILURES times") +heartbeatFailures += 1 +if (heartbeatFailures >= HEARTBEAT_MAX_FAILURES) { + System.exit(ExecutorExitCode.HEARTBEAT_FAILURE) +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/86bf93e6/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala index ea36fb6..99858f7 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala @@ -39,6 +39,12 @@ object ExecutorExitCode { /** ExternalBlockStore failed to create a local temporary directory after many attempts. */ val EXTERNAL_BLOCK_STORE_FAILED_TO_CREATE_DIR = 55 + /** + * Executor is unable to send heartbeats to the driver more than + * "spark.executor.heartbeat.maxFailures" times. 
+ */ + val HEARTBEAT_FAILURE = 56 + def explainExitCode(exitCode: Int): String = { exitCode match { case UNCAUGHT_EXCEPTION => "Uncaught exception" @@ -51,6 +57,8 @@ object ExecutorExitCode { // TODO: replace external block store with concrete implementation name case EXTERNAL_BLOCK_STORE_FAILED_TO_CREATE_DIR => "ExternalBlockStore failed to create a local temporary directory." + case HEARTBEAT_FAILURE => +"Unable to send heartbeats to driver." case _ => "Unknown executor exit code (" + exitCode + ")" + ( if (exitCode > 128) {
[1/3] spark git commit: [SPARK-13519][CORE] Driver should tell Executor to stop itself when cleaning executor's state
Repository: spark Updated Branches: refs/heads/branch-1.6 d1654864a -> ced71d353 [SPARK-13519][CORE] Driver should tell Executor to stop itself when cleaning executor's state ## What changes were proposed in this pull request? When the driver removes an executor's state, the connection between the driver and the executor may be still alive so that the executor cannot exit automatically (E.g., Master will send RemoveExecutor when a work is lost but the executor is still alive), so the driver should try to tell the executor to stop itself. Otherwise, we will leak an executor. This PR modified the driver to send `StopExecutor` to the executor when it's removed. ## How was this patch tested? manual test: increase the worker heartbeat interval to force it's always timeout and the leak executors are gone. Author: Shixiong ZhuCloses #11399 from zsxwing/SPARK-13519. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c433c0af Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c433c0af Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c433c0af Branch: refs/heads/branch-1.6 Commit: c433c0afd4c3f96ef24686a1f28262af81b67723 Parents: d165486 Author: Shixiong Zhu Authored: Fri Feb 26 15:11:57 2016 -0800 Committer: Andrew Or Committed: Wed May 11 11:29:01 2016 -0700 -- .../spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c433c0af/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 505c161..7189685 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ 
b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -179,6 +179,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp context.reply(true) case RemoveExecutor(executorId, reason) => +// We will remove the executor's state and cannot restore it. However, the connection +// between the driver and the executor may be still alive so that the executor won't exit +// automatically, so try to tell the executor to stop itself. See SPARK-13519. + executorDataMap.get(executorId).foreach(_.executorEndpoint.send(StopExecutor)) removeExecutor(executorId, reason) context.reply(true) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[3/3] spark git commit: [SPARK-13522][CORE] Fix the exit log place for heartbeat
[SPARK-13522][CORE] Fix the exit log place for heartbeat ## What changes were proposed in this pull request? Just fixed the log place introduced by #11401 ## How was this patch tested? unit tests. Author: Shixiong ZhuCloses #11432 from zsxwing/SPARK-13522-follow-up. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ced71d35 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ced71d35 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ced71d35 Branch: refs/heads/branch-1.6 Commit: ced71d353a0908abcf5b83503661bef97ae0953d Parents: 86bf93e Author: Shixiong Zhu Authored: Mon Feb 29 11:52:11 2016 -0800 Committer: Andrew Or Committed: Wed May 11 11:29:10 2016 -0700 -- core/src/main/scala/org/apache/spark/executor/Executor.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ced71d35/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index b8a1668..a3ebaff 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -481,9 +481,10 @@ private[spark] class Executor( } catch { case NonFatal(e) => logWarning("Issue communicating with driver in heartbeater", e) -logError(s"Unable to send heartbeats to driver more than $HEARTBEAT_MAX_FAILURES times") heartbeatFailures += 1 if (heartbeatFailures >= HEARTBEAT_MAX_FAILURES) { + logError(s"Exit as unable to send heartbeats to driver " + +s"more than $HEARTBEAT_MAX_FAILURES times") System.exit(ExecutorExitCode.HEARTBEAT_FAILURE) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] SQL test compilation error from merge conflict
Repository: spark Updated Branches: refs/heads/branch-2.0 9098b1a17 -> b3f145442 [HOTFIX] SQL test compilation error from merge conflict Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3f14544 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3f14544 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3f14544 Branch: refs/heads/branch-2.0 Commit: b3f145442a4419a43a13960bb2a45d28ce41bfc4 Parents: 9098b1a Author: Andrew OrAuthored: Tue May 10 11:46:02 2016 -0700 Committer: Andrew Or Committed: Thu May 12 09:20:43 2016 -0700 -- .../scala/org/apache/spark/sql/internal/CatalogSuite.scala | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b3f14544/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala index 94f77bc..e4d4cec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala @@ -279,10 +279,10 @@ class CatalogSuite assert(tableFields == Seq("nama", "databasa", "descripta", "typa", false)) assert(functionFields == Seq("nama", "descripta", "classa", false)) assert(columnFields == Seq("nama", "descripta", "typa", false, true, true)) -val dbString = CatalogImpl.makeDataset(Seq(db), sparkSession).showString(10) -val tableString = CatalogImpl.makeDataset(Seq(table), sparkSession).showString(10) -val functionString = CatalogImpl.makeDataset(Seq(function), sparkSession).showString(10) -val columnString = CatalogImpl.makeDataset(Seq(column), sparkSession).showString(10) +val dbString = CatalogImpl.makeDataset(Seq(db), spark).showString(10) +val tableString = CatalogImpl.makeDataset(Seq(table), spark).showString(10) +val functionString = 
CatalogImpl.makeDataset(Seq(function), spark).showString(10) +val columnString = CatalogImpl.makeDataset(Seq(column), spark).showString(10) dbFields.foreach { f => assert(dbString.contains(f.toString)) } tableFields.foreach { f => assert(tableString.contains(f.toString)) } functionFields.foreach { f => assert(functionString.contains(f.toString)) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15276][SQL] CREATE TABLE with LOCATION should imply EXTERNAL
Repository: spark Updated Branches: refs/heads/master b9cf617a6 -> f14c4ba00 [SPARK-15276][SQL] CREATE TABLE with LOCATION should imply EXTERNAL ## What changes were proposed in this pull request? Before: ```sql -- uses that location but issues a warning CREATE TABLE my_tab LOCATION /some/path -- deletes any existing data in the specified location DROP TABLE my_tab ``` After: ```sql -- uses that location but creates an EXTERNAL table instead CREATE TABLE my_tab LOCATION /some/path -- does not delete the data at /some/path DROP TABLE my_tab ``` This patch essentially makes the `EXTERNAL` field optional. This is related to #13032. ## How was this patch tested? New test in `DDLCommandSuite`. Author: Andrew Or <and...@databricks.com> Closes #13060 from andrewor14/location-implies-external. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f14c4ba0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f14c4ba0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f14c4ba0 Branch: refs/heads/master Commit: f14c4ba001fbdbcc9faa46896f1f9d08a7d06609 Parents: b9cf617 Author: Andrew Or <and...@databricks.com> Authored: Wed May 11 17:29:58 2016 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Wed May 11 17:29:58 2016 -0700 -- .../org/apache/spark/sql/execution/SparkSqlParser.scala | 12 +++- .../spark/sql/execution/command/DDLCommandSuite.scala | 12 .../apache/spark/sql/hive/execution/HiveDDLSuite.scala | 8 +++- .../apache/spark/sql/hive/execution/SQLQuerySuite.scala | 5 + 4 files changed, 23 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f14c4ba0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a51665f..53aba1f 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -745,11 +745,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { if (ctx.bucketSpec != null) { throw operationNotAllowed("CREATE TABLE ... CLUSTERED BY", ctx) } -val tableType = if (external) { - CatalogTableType.EXTERNAL -} else { - CatalogTableType.MANAGED -} val comment = Option(ctx.STRING).map(string) val partitionCols = Option(ctx.partitionColumns).toSeq.flatMap(visitCatalogColumns) val cols = Option(ctx.columns).toSeq.flatMap(visitCatalogColumns) @@ -791,6 +786,13 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { serde = rowStorage.serde.orElse(fileStorage.serde).orElse(defaultStorage.serde), compressed = false, serdeProperties = rowStorage.serdeProperties ++ fileStorage.serdeProperties) +// If location is defined, we'll assume this is an external table. +// Otherwise, we may accidentally delete existing data. +val tableType = if (external || location.isDefined) { + CatalogTableType.EXTERNAL +} else { + CatalogTableType.MANAGED +} // TODO support the sql text - have a proper location for this! 
val tableDesc = CatalogTable( http://git-wip-us.apache.org/repos/asf/spark/blob/f14c4ba0/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index fa8dabf..aeb613a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -227,6 +227,18 @@ class DDLCommandSuite extends PlanTest { } } + test("create table - location implies external") { +val query = "CREATE TABLE my_tab LOCATION '/something/anything'" +parser.parsePlan(query) match { + case ct: CreateTable => +assert(ct.table.tableType == CatalogTableType.EXTERNAL) +assert(ct.table.storage.locationUri == Some("/something/anything")) + case other => +fail(s"Expected to parse ${classOf[CreateTable].getClass.getName} from query," + +s"got ${other.getClass.getName}: $query") +} + } + // ALTER TABLE table_name RENAME TO new_table_name; // ALTER VIEW view_name RENAME TO new_view_name; test
spark git commit: [SPARK-15276][SQL] CREATE TABLE with LOCATION should imply EXTERNAL
Repository: spark Updated Branches: refs/heads/branch-2.0 f9ea54575 -> f763c1485 [SPARK-15276][SQL] CREATE TABLE with LOCATION should imply EXTERNAL ## What changes were proposed in this pull request? Before: ```sql -- uses that location but issues a warning CREATE TABLE my_tab LOCATION /some/path -- deletes any existing data in the specified location DROP TABLE my_tab ``` After: ```sql -- uses that location but creates an EXTERNAL table instead CREATE TABLE my_tab LOCATION /some/path -- does not delete the data at /some/path DROP TABLE my_tab ``` This patch essentially makes the `EXTERNAL` field optional. This is related to #13032. ## How was this patch tested? New test in `DDLCommandSuite`. Author: Andrew Or <and...@databricks.com> Closes #13060 from andrewor14/location-implies-external. (cherry picked from commit f14c4ba001fbdbcc9faa46896f1f9d08a7d06609) Signed-off-by: Andrew Or <and...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f763c148 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f763c148 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f763c148 Branch: refs/heads/branch-2.0 Commit: f763c14851f6e55e61de8ef79ae449a7257a547d Parents: f9ea545 Author: Andrew Or <and...@databricks.com> Authored: Wed May 11 17:29:58 2016 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Wed May 11 17:30:06 2016 -0700 -- .../org/apache/spark/sql/execution/SparkSqlParser.scala | 12 +++- .../spark/sql/execution/command/DDLCommandSuite.scala | 12 .../apache/spark/sql/hive/execution/HiveDDLSuite.scala | 8 +++- .../apache/spark/sql/hive/execution/SQLQuerySuite.scala | 5 + 4 files changed, 23 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f763c148/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a51665f..53aba1f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -745,11 +745,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { if (ctx.bucketSpec != null) { throw operationNotAllowed("CREATE TABLE ... CLUSTERED BY", ctx) } -val tableType = if (external) { - CatalogTableType.EXTERNAL -} else { - CatalogTableType.MANAGED -} val comment = Option(ctx.STRING).map(string) val partitionCols = Option(ctx.partitionColumns).toSeq.flatMap(visitCatalogColumns) val cols = Option(ctx.columns).toSeq.flatMap(visitCatalogColumns) @@ -791,6 +786,13 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { serde = rowStorage.serde.orElse(fileStorage.serde).orElse(defaultStorage.serde), compressed = false, serdeProperties = rowStorage.serdeProperties ++ fileStorage.serdeProperties) +// If location is defined, we'll assume this is an external table. +// Otherwise, we may accidentally delete existing data. +val tableType = if (external || location.isDefined) { + CatalogTableType.EXTERNAL +} else { + CatalogTableType.MANAGED +} // TODO support the sql text - have a proper location for this! 
val tableDesc = CatalogTable( http://git-wip-us.apache.org/repos/asf/spark/blob/f763c148/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index fa8dabf..aeb613a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -227,6 +227,18 @@ class DDLCommandSuite extends PlanTest { } } + test("create table - location implies external") { +val query = "CREATE TABLE my_tab LOCATION '/something/anything'" +parser.parsePlan(query) match { + case ct: CreateTable => +assert(ct.table.tableType == CatalogTableType.EXTERNAL) +assert(ct.table.storage.locationUri == Some("/something/anything")) + case other => +fail(s"Expected to parse ${classOf[CreateTable].getClass.getName} from query," + +s"got ${other.getClass.getName}: $query
spark git commit: [SPARK-15264][SPARK-15274][SQL] CSV Reader Error on Blank Column Names
Repository: spark Updated Branches: refs/heads/master f14c4ba00 -> 603f4453a [SPARK-15264][SPARK-15274][SQL] CSV Reader Error on Blank Column Names ## What changes were proposed in this pull request? When a CSV begins with: - `,,` OR - `"","",` meaning that the first column names are either empty or blank strings and `header` is specified to be `true`, then the column name is replaced with `C` + the index number of that given column. For example, if you were to read in the CSV: ``` "","second column" "hello", "there" ``` Then column names would become `"C0", "second column"`. This behavior aligns with what currently happens when `header` is specified to be `false` in recent versions of Spark. ### Current Behavior in Spark <=1.6 In Spark <=1.6, a CSV with a blank column name becomes a blank string, `""`, meaning that this column cannot be accessed. However the CSV reads in without issue. ### Current Behavior in Spark 2.0 Spark throws a NullPointerError and will not read in the file. Reproduction in 2.0 https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/346304/2828750690305044/484361/latest.html ## How was this patch tested? A new test was added to `CSVSuite` to account for this issue. We then have asserts that test for being able to select both the empty column names as well as the regular column names. Author: Bill ChambersAuthor: Bill Chambers Closes #13041 from anabranch/master. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/603f4453 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/603f4453 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/603f4453 Branch: refs/heads/master Commit: 603f4453a16825cc5773cfe24d6ae4cee5ec949a Parents: f14c4ba Author: Bill Chambers Authored: Wed May 11 17:42:13 2016 -0700 Committer: Andrew Or Committed: Wed May 11 17:42:13 2016 -0700 -- python/pyspark/sql/readwriter.py| 2 +- .../execution/datasources/csv/DefaultSource.scala | 6 -- .../src/test/resources/cars-blank-column-name.csv | 3 +++ .../sql/execution/datasources/csv/CSVSuite.scala| 16 ++-- 4 files changed, 22 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/603f4453/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 7fd7583..5cb1860 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -358,7 +358,7 @@ class DataFrameReader(object): >>> df = spark.read.csv('python/test_support/sql/ages.csv') >>> df.dtypes -[('C0', 'string'), ('C1', 'string')] +[('_c0', 'string'), ('_c1', 'string')] """ if schema is not None: self.schema(schema) http://git-wip-us.apache.org/repos/asf/spark/blob/603f4453/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala index 948fac0..f47ed76 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/DefaultSource.scala @@ -61,9 +61,11 @@ class DefaultSource extends FileFormat with DataSourceRegister { val firstRow = new 
LineCsvReader(csvOptions).parseLine(firstLine) val header = if (csvOptions.headerFlag) { - firstRow + firstRow.zipWithIndex.map { case (value, index) => +if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value + } } else { - firstRow.zipWithIndex.map { case (value, index) => s"C$index" } + firstRow.zipWithIndex.map { case (value, index) => s"_c$index" } } val parsedRdd = tokenRdd(sparkSession, csvOptions, header, paths) http://git-wip-us.apache.org/repos/asf/spark/blob/603f4453/sql/core/src/test/resources/cars-blank-column-name.csv -- diff --git a/sql/core/src/test/resources/cars-blank-column-name.csv b/sql/core/src/test/resources/cars-blank-column-name.csv new file mode 100644 index 000..0b804b1 --- /dev/null +++ b/sql/core/src/test/resources/cars-blank-column-name.csv @@ -0,0 +1,3 @@ +"",,make,customer,comment +2012,"Tesla","S","bill","blank"
spark git commit: [SPARK-15072][SQL][PYSPARK] FollowUp: Remove SparkSession.withHiveSupport in PySpark
Repository: spark Updated Branches: refs/heads/branch-2.0 f8804bb10 -> 114be703d [SPARK-15072][SQL][PYSPARK] FollowUp: Remove SparkSession.withHiveSupport in PySpark ## What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/12851 Remove `SparkSession.withHiveSupport` in PySpark and instead use `SparkSession.builder.enableHiveSupport` ## How was this patch tested? Existing tests. Author: Sandeep Singh Closes #13063 from techaddict/SPARK-15072-followup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/114be703 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/114be703 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/114be703 Branch: refs/heads/branch-2.0 Commit: 114be703d5655b6456955e795e670cd62915b37e Parents: f8804bb Author: Sandeep Singh Authored: Wed May 11 17:44:00 2016 -0700 Committer: Andrew Or Committed: Wed May 11 17:44:37 2016 -0700 -- .../sbt_app_hive/src/main/scala/HiveApp.scala | 8 +--- python/pyspark/shell.py | 4 +++- python/pyspark/sql/session.py | 10 -- .../scala/org/apache/spark/sql/hive/HiveContext.scala | 2 +- 4 files changed, 9 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/114be703/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala -- diff --git a/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala b/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala index f69d46c..8cbfb9c 100644 --- a/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala +++ b/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala @@ -33,7 +33,9 @@ object SparkSqlExample { case None => new SparkConf().setAppName("Simple Sql App") } val sc = new SparkContext(conf) -val sparkSession = SparkSession.withHiveSupport(sc) +val sparkSession = SparkSession.builder + .enableHiveSupport() + .getOrCreate() import sparkSession._ sql("DROP TABLE IF EXISTS src") @@ -41,14 +43,14 @@
object SparkSqlExample { sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src") val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect() results.foreach(println) - + def test(f: => Boolean, failureMsg: String) = { if (!f) { println(failureMsg) System.exit(-1) } } - + test(results.size == 5, "Unexpected number of selected elements: " + results) println("Test succeeded") sc.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/114be703/python/pyspark/shell.py -- diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index c6b0eda..adaa3b5 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -41,7 +41,9 @@ atexit.register(lambda: sc.stop()) try: # Try to access HiveConf, it will raise exception if Hive is not added sc._jvm.org.apache.hadoop.hive.conf.HiveConf() -spark = SparkSession.withHiveSupport(sc) +spark = SparkSession.builder\ +.enableHiveSupport()\ +.getOrCreate() except py4j.protocol.Py4JError: spark = SparkSession(sc) except TypeError: http://git-wip-us.apache.org/repos/asf/spark/blob/114be703/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 04842f6..4ee9ab8 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -182,16 +182,6 @@ class SparkSession(object): if SparkSession._instantiatedContext is None: SparkSession._instantiatedContext = self -@classmethod -@since(2.0) -def withHiveSupport(cls, sparkContext): -"""Returns a new SparkSession with a catalog backed by Hive. - -:param sparkContext: The underlying :class:`SparkContext`. 
-""" -jsparkSession = sparkContext._jvm.SparkSession.withHiveSupport(sparkContext._jsc.sc()) -return cls(sparkContext, jsparkSession) - @since(2.0) def newSession(self): """ http://git-wip-us.apache.org/repos/asf/spark/blob/114be703/sql/hivecontext-compatibility/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala -- diff --git a/sql/hivecontext-compatibility/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hivecontext-compatibility/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index
spark git commit: [SPARK-14684][SPARK-15277][SQL] Partition Spec Validation in SessionCatalog and Checking Partition Spec Existence Before Dropping
Repository: spark Updated Branches: refs/heads/master 470de743e -> be617f3d0 [SPARK-14684][SPARK-15277][SQL] Partition Spec Validation in SessionCatalog and Checking Partition Spec Existence Before Dropping What changes were proposed in this pull request? ~~Currently, multiple partitions are allowed to drop by using a single DDL command: Alter Table Drop Partition. However, the internal implementation could break atomicity. That means, we could just drop a subset of qualified partitions, if hitting an exception when dropping one of qualified partitions~~ ~~This PR contains the following behavior changes:~~ ~~- disallow dropping multiple partitions by a single command ~~ ~~- allow users to input predicates in partition specification and issue a nicer error message if the predicate's comparison operator is not `=`.~~ ~~- verify the partition spec in SessionCatalog. This can ensure each partition spec in `Drop Partition` does not correspond to multiple partitions.~~ This PR has two major parts: - Verify the partition spec in SessionCatalog for fixing the following issue: ```scala sql(s"ALTER TABLE $externalTab DROP PARTITION (ds='2008-04-09', unknownCol='12')") ``` Above example uses an invalid partition spec. Without this PR, we will drop all the partitions. The reason is that the Hive metastore's getPartitions API returns all the partitions if we provide an invalid spec. - Re-implemented the `dropPartitions` in `HiveClientImpl`. Now, we always check if all the user-specified partition specs exist before attempting to drop the partitions. Previously, we started dropping the partitions before completing the check of the existence of all the partition specs. If any failure happened after we started to drop the partitions, we will log an error message to indicate which partitions have been dropped and which partitions have not been dropped. How was this patch tested? Modified the existing test cases and added new test cases. 
Author: gatorsmileAuthor: xiaoli Author: Xiao Li Closes #12801 from gatorsmile/banDropMultiPart. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be617f3d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be617f3d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be617f3d Branch: refs/heads/master Commit: be617f3d0695982f982006fdd79afe3e3730b4c4 Parents: 470de74 Author: gatorsmile Authored: Thu May 12 11:14:40 2016 -0700 Committer: Andrew Or Committed: Thu May 12 11:14:40 2016 -0700 -- .../sql/catalyst/catalog/SessionCatalog.scala | 47 +++- .../catalyst/catalog/ExternalCatalogSuite.scala | 6 + .../catalyst/catalog/SessionCatalogSuite.scala | 116 ++- .../spark/sql/execution/command/DDLSuite.scala | 78 ++--- .../spark/sql/hive/client/HiveClientImpl.scala | 50 +--- .../spark/sql/hive/execution/HiveDDLSuite.scala | 9 +- 6 files changed, 248 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/be617f3d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 0fc4ab5..54b30d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -510,6 +510,7 @@ class SessionCatalog( tableName: TableIdentifier, parts: Seq[CatalogTablePartition], ignoreIfExists: Boolean): Unit = { +requireExactMatchedPartitionSpec(parts.map(_.spec), getTableMetadata(tableName)) val db = formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase)) val table = formatTableName(tableName.table) requireDbExists(db) @@ -523,13 +524,14 @@ class SessionCatalog( */ def dropPartitions( tableName: TableIdentifier, - parts: 
Seq[TablePartitionSpec], + specs: Seq[TablePartitionSpec], ignoreIfNotExists: Boolean): Unit = { +requirePartialMatchedPartitionSpec(specs, getTableMetadata(tableName)) val db = formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase)) val table = formatTableName(tableName.table) requireDbExists(db) requireTableExists(TableIdentifier(table, Option(db))) -externalCatalog.dropPartitions(db, table, parts, ignoreIfNotExists) +externalCatalog.dropPartitions(db, table, specs, ignoreIfNotExists) } /** @@ -542,6 +544,9
spark git commit: [SPARK-14684][SPARK-15277][SQL] Partition Spec Validation in SessionCatalog and Checking Partition Spec Existence Before Dropping
Repository: spark Updated Branches: refs/heads/branch-2.0 68617e1ad -> 9c5c9013d [SPARK-14684][SPARK-15277][SQL] Partition Spec Validation in SessionCatalog and Checking Partition Spec Existence Before Dropping What changes were proposed in this pull request? ~~Currently, multiple partitions are allowed to drop by using a single DDL command: Alter Table Drop Partition. However, the internal implementation could break atomicity. That means, we could just drop a subset of qualified partitions, if hitting an exception when dropping one of qualified partitions~~ ~~This PR contains the following behavior changes:~~ ~~- disallow dropping multiple partitions by a single command ~~ ~~- allow users to input predicates in partition specification and issue a nicer error message if the predicate's comparison operator is not `=`.~~ ~~- verify the partition spec in SessionCatalog. This can ensure each partition spec in `Drop Partition` does not correspond to multiple partitions.~~ This PR has two major parts: - Verify the partition spec in SessionCatalog for fixing the following issue: ```scala sql(s"ALTER TABLE $externalTab DROP PARTITION (ds='2008-04-09', unknownCol='12')") ``` Above example uses an invalid partition spec. Without this PR, we will drop all the partitions. The reason is that the Hive metastore's getPartitions API returns all the partitions if we provide an invalid spec. - Re-implemented the `dropPartitions` in `HiveClientImpl`. Now, we always check if all the user-specified partition specs exist before attempting to drop the partitions. Previously, we started dropping the partitions before completing the check of the existence of all the partition specs. If any failure happened after we started to drop the partitions, we will log an error message to indicate which partitions have been dropped and which partitions have not been dropped. How was this patch tested? Modified the existing test cases and added new test cases. 
Author: gatorsmileAuthor: xiaoli Author: Xiao Li Closes #12801 from gatorsmile/banDropMultiPart. (cherry picked from commit be617f3d0695982f982006fdd79afe3e3730b4c4) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c5c9013 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c5c9013 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c5c9013 Branch: refs/heads/branch-2.0 Commit: 9c5c9013de1311b3175a6156fb90447f00c7a883 Parents: 68617e1 Author: gatorsmile Authored: Thu May 12 11:14:40 2016 -0700 Committer: Andrew Or Committed: Thu May 12 11:14:52 2016 -0700 -- .../sql/catalyst/catalog/SessionCatalog.scala | 47 +++- .../catalyst/catalog/ExternalCatalogSuite.scala | 6 + .../catalyst/catalog/SessionCatalogSuite.scala | 116 ++- .../spark/sql/execution/command/DDLSuite.scala | 78 ++--- .../spark/sql/hive/client/HiveClientImpl.scala | 50 +--- .../spark/sql/hive/execution/HiveDDLSuite.scala | 9 +- 6 files changed, 248 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9c5c9013/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 0fc4ab5..54b30d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -510,6 +510,7 @@ class SessionCatalog( tableName: TableIdentifier, parts: Seq[CatalogTablePartition], ignoreIfExists: Boolean): Unit = { +requireExactMatchedPartitionSpec(parts.map(_.spec), getTableMetadata(tableName)) val db = formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase)) val table = formatTableName(tableName.table) requireDbExists(db) @@ -523,13 
+524,14 @@ class SessionCatalog( */ def dropPartitions( tableName: TableIdentifier, - parts: Seq[TablePartitionSpec], + specs: Seq[TablePartitionSpec], ignoreIfNotExists: Boolean): Unit = { +requirePartialMatchedPartitionSpec(specs, getTableMetadata(tableName)) val db = formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase)) val table = formatTableName(tableName.table) requireDbExists(db) requireTableExists(TableIdentifier(table, Option(db))) -externalCatalog.dropPartitions(db, table,
spark git commit: [SPARK-15031][SPARK-15134][EXAMPLE][DOC] Use SparkSession and update indent in examples
Repository: spark Updated Branches: refs/heads/branch-2.0 7d187539e -> 86acb5efd [SPARK-15031][SPARK-15134][EXAMPLE][DOC] Use SparkSession and update indent in examples ## What changes were proposed in this pull request? 1, Use `SparkSession` according to [SPARK-15031](https://issues.apache.org/jira/browse/SPARK-15031) 2, Update indent for `SparkContext` according to [SPARK-15134](https://issues.apache.org/jira/browse/SPARK-15134) 3, BTW, remove some duplicate space and add missing '.' ## How was this patch tested? manual tests Author: Zheng RuiFengCloses #13050 from zhengruifeng/use_sparksession. (cherry picked from commit 9e266d07a444fd465fe178cdd5c4894cd09cbda3) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/86acb5ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/86acb5ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/86acb5ef Branch: refs/heads/branch-2.0 Commit: 86acb5efdbc52820f89c039edac61c0454709f4c Parents: 7d18753 Author: Zheng RuiFeng Authored: Wed May 11 22:45:30 2016 -0700 Committer: Andrew Or Committed: Wed May 11 22:45:41 2016 -0700 -- .../JavaDecisionTreeClassificationExample.java | 14 --- .../ml/JavaDecisionTreeRegressionExample.java | 12 +++--- .../examples/ml/JavaDeveloperApiExample.java| 6 +-- .../JavaEstimatorTransformerParamExample.java | 4 +- ...avaGradientBoostedTreeClassifierExample.java | 6 +-- ...JavaGradientBoostedTreeRegressorExample.java | 12 +++--- ...vaLinearRegressionWithElasticNetExample.java | 12 +++--- .../JavaLogisticRegressionSummaryExample.java | 4 +- ...LogisticRegressionWithElasticNetExample.java | 4 +- ...ModelSelectionViaCrossValidationExample.java | 4 +- ...SelectionViaTrainValidationSplitExample.java | 4 +- ...vaMultilayerPerceptronClassifierExample.java | 4 +- .../ml/JavaQuantileDiscretizerExample.java | 4 +- .../ml/JavaRandomForestClassifierExample.java | 4 +- .../ml/JavaRandomForestRegressorExample.java| 
6 ++- .../examples/ml/JavaSimpleParamsExample.java| 8 ++-- .../JavaSimpleTextClassificationPipeline.java | 4 +- .../ml/DecisionTreeClassificationExample.scala | 10 ++--- .../spark/examples/ml/DecisionTreeExample.scala | 39 ++-- .../ml/DecisionTreeRegressionExample.scala | 8 ++-- .../spark/examples/ml/DeveloperApiExample.scala | 14 +++ .../ml/EstimatorTransformerParamExample.scala | 8 ++-- .../apache/spark/examples/ml/GBTExample.scala | 30 --- .../GradientBoostedTreeClassifierExample.scala | 8 ++-- .../GradientBoostedTreeRegressorExample.scala | 8 ++-- .../examples/ml/LinearRegressionExample.scala | 17 + .../examples/ml/LogisticRegressionExample.scala | 21 ++- ...ogisticRegressionWithElasticNetExample.scala | 4 +- ...odelSelectionViaCrossValidationExample.scala | 4 +- ...electionViaTrainValidationSplitExample.scala | 4 +- .../ml/RandomForestClassifierExample.scala | 8 ++-- .../spark/examples/ml/RandomForestExample.scala | 32 .../ml/RandomForestRegressorExample.scala | 8 ++-- .../spark/examples/ml/SimpleParamsExample.scala | 8 ++-- 34 files changed, 192 insertions(+), 151 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/86acb5ef/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java index 733bc41..bdb76f0 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java @@ -32,7 +32,9 @@ import org.apache.spark.sql.SparkSession; public class JavaDecisionTreeClassificationExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaDecisionTreeClassificationExample").getOrCreate(); + .builder() + 
.appName("JavaDecisionTreeClassificationExample") + .getOrCreate(); // $example on$ // Load the data stored in LIBSVM format as a DataFrame. @@ -52,10 +54,10 @@ public class JavaDecisionTreeClassificationExample { VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") - .setMaxCategories(4) // features with > 4 distinct values are treated as continuous + .setMaxCategories(4) //
spark git commit: [SPARK-15031][SPARK-15134][EXAMPLE][DOC] Use SparkSession and update indent in examples
Repository: spark Updated Branches: refs/heads/master ba5487c06 -> 9e266d07a [SPARK-15031][SPARK-15134][EXAMPLE][DOC] Use SparkSession and update indent in examples ## What changes were proposed in this pull request? 1, Use `SparkSession` according to [SPARK-15031](https://issues.apache.org/jira/browse/SPARK-15031) 2, Update indent for `SparkContext` according to [SPARK-15134](https://issues.apache.org/jira/browse/SPARK-15134) 3, BTW, remove some duplicate space and add missing '.' ## How was this patch tested? manual tests Author: Zheng RuiFengCloses #13050 from zhengruifeng/use_sparksession. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e266d07 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e266d07 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e266d07 Branch: refs/heads/master Commit: 9e266d07a444fd465fe178cdd5c4894cd09cbda3 Parents: ba5487c Author: Zheng RuiFeng Authored: Wed May 11 22:45:30 2016 -0700 Committer: Andrew Or Committed: Wed May 11 22:45:30 2016 -0700 -- .../JavaDecisionTreeClassificationExample.java | 14 --- .../ml/JavaDecisionTreeRegressionExample.java | 12 +++--- .../examples/ml/JavaDeveloperApiExample.java| 6 +-- .../JavaEstimatorTransformerParamExample.java | 4 +- ...avaGradientBoostedTreeClassifierExample.java | 6 +-- ...JavaGradientBoostedTreeRegressorExample.java | 12 +++--- ...vaLinearRegressionWithElasticNetExample.java | 12 +++--- .../JavaLogisticRegressionSummaryExample.java | 4 +- ...LogisticRegressionWithElasticNetExample.java | 4 +- ...ModelSelectionViaCrossValidationExample.java | 4 +- ...SelectionViaTrainValidationSplitExample.java | 4 +- ...vaMultilayerPerceptronClassifierExample.java | 4 +- .../ml/JavaQuantileDiscretizerExample.java | 4 +- .../ml/JavaRandomForestClassifierExample.java | 4 +- .../ml/JavaRandomForestRegressorExample.java| 6 ++- .../examples/ml/JavaSimpleParamsExample.java| 8 ++-- 
.../JavaSimpleTextClassificationPipeline.java | 4 +- .../ml/DecisionTreeClassificationExample.scala | 10 ++--- .../spark/examples/ml/DecisionTreeExample.scala | 39 ++-- .../ml/DecisionTreeRegressionExample.scala | 8 ++-- .../spark/examples/ml/DeveloperApiExample.scala | 14 +++ .../ml/EstimatorTransformerParamExample.scala | 8 ++-- .../apache/spark/examples/ml/GBTExample.scala | 30 --- .../GradientBoostedTreeClassifierExample.scala | 8 ++-- .../GradientBoostedTreeRegressorExample.scala | 8 ++-- .../examples/ml/LinearRegressionExample.scala | 17 + .../examples/ml/LogisticRegressionExample.scala | 21 ++- ...ogisticRegressionWithElasticNetExample.scala | 4 +- ...odelSelectionViaCrossValidationExample.scala | 4 +- ...electionViaTrainValidationSplitExample.scala | 4 +- .../ml/RandomForestClassifierExample.scala | 8 ++-- .../spark/examples/ml/RandomForestExample.scala | 32 .../ml/RandomForestRegressorExample.scala | 8 ++-- .../spark/examples/ml/SimpleParamsExample.scala | 8 ++-- 34 files changed, 192 insertions(+), 151 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e266d07/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java index 733bc41..bdb76f0 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java @@ -32,7 +32,9 @@ import org.apache.spark.sql.SparkSession; public class JavaDecisionTreeClassificationExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaDecisionTreeClassificationExample").getOrCreate(); + .builder() + .appName("JavaDecisionTreeClassificationExample") + .getOrCreate(); // $example 
on$ // Load the data stored in LIBSVM format as a DataFrame. @@ -52,10 +54,10 @@ public class JavaDecisionTreeClassificationExample { VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") - .setMaxCategories(4) // features with > 4 distinct values are treated as continuous + .setMaxCategories(4) // features with > 4 distinct values are treated as continuous. .fit(data); -// Split the data into training and test sets
spark git commit: [SPARK-14422][SQL] Improve handling of optional configs in SQLConf
Repository: spark Updated Branches: refs/heads/branch-2.0 0d16b7f3a -> 5625b037a [SPARK-14422][SQL] Improve handling of optional configs in SQLConf ## What changes were proposed in this pull request? Create a new API for handling Optional Configs in SQLConf. Right now `getConf` for `OptionalConfigEntry[T]` returns value of type `T`, if doesn't exist throws an exception. Add new method `getOptionalConf`(suggestions on naming) which will now returns value of type `Option[T]`(so if doesn't exist it returns `None`). ## How was this patch tested? Add test and ran tests locally. Author: Sandeep SinghCloses #12846 from techaddict/SPARK-14422. (cherry picked from commit a8d56f538878443da6eae69449858ad4e2274151) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5625b037 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5625b037 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5625b037 Branch: refs/heads/branch-2.0 Commit: 5625b037a0c952b97e1afa6a3113c0847ade Parents: 0d16b7f Author: Sandeep Singh Authored: Tue May 3 18:02:57 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:03:05 2016 -0700 -- .../scala/org/apache/spark/sql/DataFrameWriter.scala | 9 + .../main/scala/org/apache/spark/sql/RuntimeConfig.scala | 6 +- .../scala/org/apache/spark/sql/internal/SQLConf.scala| 9 - .../apache/spark/sql/internal/SQLConfEntrySuite.scala| 11 +++ 4 files changed, 25 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5625b037/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index a8f96a9..0793b62 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -296,7 +296,7 @@ final class 
DataFrameWriter private[sql](df: DataFrame) { new Path(userSpecified).toUri.toString }.orElse { val checkpointConfig: Option[String] = - df.sparkSession.conf.get(SQLConf.CHECKPOINT_LOCATION, None) + df.sparkSession.conf.get(SQLConf.CHECKPOINT_LOCATION) checkpointConfig.map { location => new Path(location, queryName).toUri.toString @@ -334,9 +334,10 @@ final class DataFrameWriter private[sql](df: DataFrame) { partitionColumns = normalizedParCols.getOrElse(Nil)) val queryName = extraOptions.getOrElse("queryName", StreamExecution.nextName) - val checkpointLocation = extraOptions.getOrElse("checkpointLocation", { -new Path(df.sparkSession.sessionState.conf.checkpointLocation, queryName).toUri.toString - }) + val checkpointLocation = extraOptions.getOrElse("checkpointLocation", +new Path(df.sparkSession.sessionState.conf.checkpointLocation.get, queryName).toUri.toString + ) + df.sparkSession.sessionState.continuousQueryManager.startQuery( queryName, checkpointLocation, http://git-wip-us.apache.org/repos/asf/spark/blob/5625b037/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index 670288b..4fd6e42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.internal.config.ConfigEntry +import org.apache.spark.internal.config.{ConfigEntry, OptionalConfigEntry} import org.apache.spark.sql.internal.SQLConf @@ -86,6 +86,10 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { sqlConf.getConf(entry) } + protected[sql] def get[T](entry: OptionalConfigEntry[T]): Option[T] = { +sqlConf.getConf(entry) + } + /** * Returns the value of Spark runtime configuration property for the given key. 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/5625b037/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0bcf0f8..5e19984 100644 ---
spark git commit: [SPARK-14645][MESOS] Fix python running on cluster mode mesos to have non local uris
Repository: spark Updated Branches: refs/heads/branch-2.0 5625b037a -> 4c7f5a74d [SPARK-14645][MESOS] Fix python running on cluster mode mesos to have non local uris ## What changes were proposed in this pull request? Fix SparkSubmit to allow non-local python uris ## How was this patch tested? Manually tested with mesos-spark-dispatcher Author: Timothy ChenCloses #12403 from tnachen/enable_remote_python. (cherry picked from commit c1839c9911e37488230a68dec9041eb5958b6f1c) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c7f5a74 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c7f5a74 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c7f5a74 Branch: refs/heads/branch-2.0 Commit: 4c7f5a74d702c3bed0b07cfa498cbe7e653fce03 Parents: 5625b03 Author: Timothy Chen Authored: Tue May 3 18:04:04 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:04:15 2016 -0700 -- core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4c7f5a74/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 926e1ff..755c4b6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -305,8 +305,9 @@ object SparkSubmit { } // Require all python files to be local, so we can add them to the PYTHONPATH -// In YARN cluster mode, python files are distributed as regular files, which can be non-local -if (args.isPython && !isYarnCluster) { +// In YARN cluster mode, python files are distributed as regular files, which can be non-local. +// In Mesos cluster mode, non-local python files are automatically downloaded by Mesos. 
+if (args.isPython && !isYarnCluster && !isMesosCluster) { if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { printErrorAndExit(s"Only local python files are supported: $args.primaryResource") } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14645][MESOS] Fix python running on cluster mode mesos to have non local uris
Repository: spark Updated Branches: refs/heads/master a8d56f538 -> c1839c991 [SPARK-14645][MESOS] Fix python running on cluster mode mesos to have non local uris ## What changes were proposed in this pull request? Fix SparkSubmit to allow non-local python uris ## How was this patch tested? Manually tested with mesos-spark-dispatcher Author: Timothy ChenCloses #12403 from tnachen/enable_remote_python. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c1839c99 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c1839c99 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c1839c99 Branch: refs/heads/master Commit: c1839c9911e37488230a68dec9041eb5958b6f1c Parents: a8d56f5 Author: Timothy Chen Authored: Tue May 3 18:04:04 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:04:04 2016 -0700 -- core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c1839c99/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 926e1ff..755c4b6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -305,8 +305,9 @@ object SparkSubmit { } // Require all python files to be local, so we can add them to the PYTHONPATH -// In YARN cluster mode, python files are distributed as regular files, which can be non-local -if (args.isPython && !isYarnCluster) { +// In YARN cluster mode, python files are distributed as regular files, which can be non-local. +// In Mesos cluster mode, non-local python files are automatically downloaded by Mesos. 
+if (args.isPython && !isYarnCluster && !isMesosCluster) { if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { printErrorAndExit(s"Only local python files are supported: $args.primaryResource") } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15097][SQL] make Dataset.sqlContext a stable identifier for imports
Repository: spark Updated Branches: refs/heads/master 0903a185c -> 9e4928b7e [SPARK-15097][SQL] make Dataset.sqlContext a stable identifier for imports ## What changes were proposed in this pull request? Make Dataset.sqlContext a lazy val so that its a stable identifier and can be used for imports. Now this works again: import someDataset.sqlContext.implicits._ ## How was this patch tested? Add unit test to DatasetSuite that uses the import show above. Author: Koert KuipersCloses #12877 from koertkuipers/feat-sqlcontext-stable-import. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e4928b7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e4928b7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e4928b7 Branch: refs/heads/master Commit: 9e4928b7e00788913553c1cb0722048001b91601 Parents: 0903a18 Author: Koert Kuipers Authored: Tue May 3 18:06:35 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:06:35 2016 -0700 -- .../src/main/scala/org/apache/spark/sql/Dataset.scala | 3 ++- .../test/scala/org/apache/spark/sql/DatasetSuite.scala | 13 + 2 files changed, 15 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e4928b7/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 08be94e..1bea72c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -211,7 +211,8 @@ class Dataset[T] private[sql]( private implicit def classTag = unresolvedTEncoder.clsTag - def sqlContext: SQLContext = sparkSession.wrapped + // sqlContext must be val because a stable identifier is expected when you import implicits + @transient lazy val sqlContext: SQLContext = sparkSession.wrapped protected[sql] def resolve(colName: String): NamedExpression = { 
queryExecution.analyzed.resolveQuoted(colName, sparkSession.sessionState.analyzer.resolver) http://git-wip-us.apache.org/repos/asf/spark/blob/9e4928b7/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index eee21ac..68a12b0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -653,6 +653,11 @@ class DatasetSuite extends QueryTest with SharedSQLContext { dataset.join(actual, dataset("user") === actual("id")).collect() } + + test("SPARK-15097: implicits on dataset's sqlContext can be imported") { +val dataset = Seq(1, 2, 3).toDS() +checkDataset(DatasetTransform.addOne(dataset), 2, 3, 4) + } } case class OtherTuple(_1: String, _2: Int) @@ -713,3 +718,11 @@ class JavaData(val a: Int) extends Serializable { object JavaData { def apply(a: Int): JavaData = new JavaData(a) } + +/** Used to test importing dataset.sqlContext.implicits._ */ +object DatasetTransform { + def addOne(ds: Dataset[Int]): Dataset[Int] = { +import ds.sqlContext.implicits._ +ds.map(_ + 1) + } +} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15097][SQL] make Dataset.sqlContext a stable identifier for imports
Repository: spark Updated Branches: refs/heads/branch-2.0 5e15615d1 -> 95d359abd [SPARK-15097][SQL] make Dataset.sqlContext a stable identifier for imports ## What changes were proposed in this pull request? Make Dataset.sqlContext a lazy val so that its a stable identifier and can be used for imports. Now this works again: import someDataset.sqlContext.implicits._ ## How was this patch tested? Add unit test to DatasetSuite that uses the import show above. Author: Koert KuipersCloses #12877 from koertkuipers/feat-sqlcontext-stable-import. (cherry picked from commit 9e4928b7e00788913553c1cb0722048001b91601) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95d359ab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95d359ab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95d359ab Branch: refs/heads/branch-2.0 Commit: 95d359abdd852c5f8da2d1ee982a1e0df0963868 Parents: 5e15615 Author: Koert Kuipers Authored: Tue May 3 18:06:35 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:06:43 2016 -0700 -- .../src/main/scala/org/apache/spark/sql/Dataset.scala | 3 ++- .../test/scala/org/apache/spark/sql/DatasetSuite.scala | 13 + 2 files changed, 15 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/95d359ab/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 08be94e..1bea72c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -211,7 +211,8 @@ class Dataset[T] private[sql]( private implicit def classTag = unresolvedTEncoder.clsTag - def sqlContext: SQLContext = sparkSession.wrapped + // sqlContext must be val because a stable identifier is expected when you import implicits + @transient lazy val sqlContext: 
SQLContext = sparkSession.wrapped protected[sql] def resolve(colName: String): NamedExpression = { queryExecution.analyzed.resolveQuoted(colName, sparkSession.sessionState.analyzer.resolver) http://git-wip-us.apache.org/repos/asf/spark/blob/95d359ab/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index eee21ac..68a12b0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -653,6 +653,11 @@ class DatasetSuite extends QueryTest with SharedSQLContext { dataset.join(actual, dataset("user") === actual("id")).collect() } + + test("SPARK-15097: implicits on dataset's sqlContext can be imported") { +val dataset = Seq(1, 2, 3).toDS() +checkDataset(DatasetTransform.addOne(dataset), 2, 3, 4) + } } case class OtherTuple(_1: String, _2: Int) @@ -713,3 +718,11 @@ class JavaData(val a: Int) extends Serializable { object JavaData { def apply(a: Int): JavaData = new JavaData(a) } + +/** Used to test importing dataset.sqlContext.implicits._ */ +object DatasetTransform { + def addOne(ds: Dataset[Int]): Dataset[Int] = { +import ds.sqlContext.implicits._ +ds.map(_ + 1) + } +} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15084][PYTHON][SQL] Use builder pattern to create SparkSession in PySpark.
Repository: spark Updated Branches: refs/heads/branch-2.0 4c7f5a74d -> 5e15615d1 [SPARK-15084][PYTHON][SQL] Use builder pattern to create SparkSession in PySpark. ## What changes were proposed in this pull request? This is a python port of corresponding Scala builder pattern code. `sql.py` is modified as a target example case. ## How was this patch tested? Manual. Author: Dongjoon HyunCloses #12860 from dongjoon-hyun/SPARK-15084. (cherry picked from commit 0903a185c7ebc57c75301a27d215b08efd347f99) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e15615d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e15615d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e15615d Branch: refs/heads/branch-2.0 Commit: 5e15615d1258db8c31a8c0c9f9f33965bc14a910 Parents: 4c7f5a7 Author: Dongjoon Hyun Authored: Tue May 3 18:05:40 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:05:49 2016 -0700 -- examples/src/main/python/sql.py | 35 ++ python/pyspark/sql/session.py | 91 +++- 2 files changed, 105 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5e15615d/examples/src/main/python/sql.py -- diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py index 2c18875..ea6a22d 100644 --- a/examples/src/main/python/sql.py +++ b/examples/src/main/python/sql.py @@ -20,33 +20,28 @@ from __future__ import print_function import os import sys -from pyspark import SparkContext -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType if __name__ == "__main__": -sc = SparkContext(appName="PythonSQL") -sqlContext = SQLContext(sc) +spark = SparkSession.builder.appName("PythonSQL").getOrCreate() -# RDD is created from a list of rows -some_rdd = sc.parallelize([Row(name="John", age=19), - Row(name="Smith", age=23), - Row(name="Sarah", 
age=18)]) -# Infer schema from the first row, create a DataFrame and print the schema -some_df = sqlContext.createDataFrame(some_rdd) +# A list of Rows. Infer schema from the first row, create a DataFrame and print the schema +rows = [Row(name="John", age=19), Row(name="Smith", age=23), Row(name="Sarah", age=18)] +some_df = spark.createDataFrame(rows) some_df.printSchema() -# Another RDD is created from a list of tuples -another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)]) +# A list of tuples +tuples = [("John", 19), ("Smith", 23), ("Sarah", 18)] # Schema with two fields - person_name and person_age schema = StructType([StructField("person_name", StringType(), False), StructField("person_age", IntegerType(), False)]) # Create a DataFrame by applying the schema to the RDD and print the schema -another_df = sqlContext.createDataFrame(another_rdd, schema) +another_df = spark.createDataFrame(tuples, schema) another_df.printSchema() # root -# |-- age: integer (nullable = true) +# |-- age: long (nullable = true) # |-- name: string (nullable = true) # A JSON dataset is pointed to by path. @@ -57,7 +52,7 @@ if __name__ == "__main__": else: path = sys.argv[1] # Create a DataFrame from the file(s) pointed to by path -people = sqlContext.jsonFile(path) +people = spark.read.json(path) # root # |-- person_name: string (nullable = false) # |-- person_age: integer (nullable = false) @@ -65,16 +60,16 @@ if __name__ == "__main__": # The inferred schema can be visualized using the printSchema() method. people.printSchema() # root -# |-- age: IntegerType -# |-- name: StringType +# |-- age: long (nullable = true) +# |-- name: string (nullable = true) # Register this DataFrame as a table. 
-people.registerAsTable("people") +people.registerTempTable("people") # SQL statements can be run by using the sql methods provided by sqlContext -teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") +teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") for each in teenagers.collect(): print(each[0]) -sc.stop() +spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/5e15615d/python/pyspark/sql/session.py -- diff --git
spark git commit: [SPARK-15084][PYTHON][SQL] Use builder pattern to create SparkSession in PySpark.
Repository: spark Updated Branches: refs/heads/master c1839c991 -> 0903a185c [SPARK-15084][PYTHON][SQL] Use builder pattern to create SparkSession in PySpark. ## What changes were proposed in this pull request? This is a python port of corresponding Scala builder pattern code. `sql.py` is modified as a target example case. ## How was this patch tested? Manual. Author: Dongjoon HyunCloses #12860 from dongjoon-hyun/SPARK-15084. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0903a185 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0903a185 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0903a185 Branch: refs/heads/master Commit: 0903a185c7ebc57c75301a27d215b08efd347f99 Parents: c1839c9 Author: Dongjoon Hyun Authored: Tue May 3 18:05:40 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:05:40 2016 -0700 -- examples/src/main/python/sql.py | 35 ++ python/pyspark/sql/session.py | 91 +++- 2 files changed, 105 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0903a185/examples/src/main/python/sql.py -- diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py index 2c18875..ea6a22d 100644 --- a/examples/src/main/python/sql.py +++ b/examples/src/main/python/sql.py @@ -20,33 +20,28 @@ from __future__ import print_function import os import sys -from pyspark import SparkContext -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType if __name__ == "__main__": -sc = SparkContext(appName="PythonSQL") -sqlContext = SQLContext(sc) +spark = SparkSession.builder.appName("PythonSQL").getOrCreate() -# RDD is created from a list of rows -some_rdd = sc.parallelize([Row(name="John", age=19), - Row(name="Smith", age=23), - Row(name="Sarah", age=18)]) -# Infer schema from the first row, create a DataFrame and print the schema -some_df = 
sqlContext.createDataFrame(some_rdd) +# A list of Rows. Infer schema from the first row, create a DataFrame and print the schema +rows = [Row(name="John", age=19), Row(name="Smith", age=23), Row(name="Sarah", age=18)] +some_df = spark.createDataFrame(rows) some_df.printSchema() -# Another RDD is created from a list of tuples -another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)]) +# A list of tuples +tuples = [("John", 19), ("Smith", 23), ("Sarah", 18)] # Schema with two fields - person_name and person_age schema = StructType([StructField("person_name", StringType(), False), StructField("person_age", IntegerType(), False)]) # Create a DataFrame by applying the schema to the RDD and print the schema -another_df = sqlContext.createDataFrame(another_rdd, schema) +another_df = spark.createDataFrame(tuples, schema) another_df.printSchema() # root -# |-- age: integer (nullable = true) +# |-- age: long (nullable = true) # |-- name: string (nullable = true) # A JSON dataset is pointed to by path. @@ -57,7 +52,7 @@ if __name__ == "__main__": else: path = sys.argv[1] # Create a DataFrame from the file(s) pointed to by path -people = sqlContext.jsonFile(path) +people = spark.read.json(path) # root # |-- person_name: string (nullable = false) # |-- person_age: integer (nullable = false) @@ -65,16 +60,16 @@ if __name__ == "__main__": # The inferred schema can be visualized using the printSchema() method. people.printSchema() # root -# |-- age: IntegerType -# |-- name: StringType +# |-- age: long (nullable = true) +# |-- name: string (nullable = true) # Register this DataFrame as a table. 
-people.registerAsTable("people") +people.registerTempTable("people") # SQL statements can be run by using the sql methods provided by sqlContext -teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") +teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") for each in teenagers.collect(): print(each[0]) -sc.stop() +spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/0903a185/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 35c36b4..fb3e318 100644 --- a/python/pyspark/sql/session.py +++
spark git commit: [MINOR][DOC] Fixed some python snippets in mllib data types documentation.
Repository: spark Updated Branches: refs/heads/branch-2.0 c212307b9 -> 0d16b7f3a [MINOR][DOC] Fixed some python snippets in mllib data types documentation. ## What changes were proposed in this pull request? Some python snippets is using scala imports and comments. ## How was this patch tested? Generated the docs locally with `SKIP_API=1 jekyll build` and viewed the changes in the browser. Author: Shuai LinCloses #12869 from lins05/fix-mllib-python-snippets. (cherry picked from commit c4e0fde876fff259308d1d58ab51ae2697ae31f1) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d16b7f3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d16b7f3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d16b7f3 Branch: refs/heads/branch-2.0 Commit: 0d16b7f3a5fbb2eb047edca838bcbde9037227a3 Parents: c212307 Author: Shuai Lin Authored: Tue May 3 18:02:12 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:02:23 2016 -0700 -- docs/mllib-data-types.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0d16b7f3/docs/mllib-data-types.md -- diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index 5e3ee47..2ffe0f1 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -314,12 +314,12 @@ matrices. Remember, local matrices in MLlib are stored in column-major order. Refer to the [`Matrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix) and [`Matrices` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrices) for more details on the API. 
{% highlight python %} -import org.apache.spark.mllib.linalg.{Matrix, Matrices} +from pyspark.mllib.linalg import Matrix, Matrices -// Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) +# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) -// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) +# Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8]) {% endhighlight %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][DOC] Fixed some python snippets in mllib data types documentation.
Repository: spark Updated Branches: refs/heads/master dbacd9998 -> c4e0fde87 [MINOR][DOC] Fixed some python snippets in mllib data types documentation. ## What changes were proposed in this pull request? Some python snippets is using scala imports and comments. ## How was this patch tested? Generated the docs locally with `SKIP_API=1 jekyll build` and viewed the changes in the browser. Author: Shuai LinCloses #12869 from lins05/fix-mllib-python-snippets. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c4e0fde8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c4e0fde8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c4e0fde8 Branch: refs/heads/master Commit: c4e0fde876fff259308d1d58ab51ae2697ae31f1 Parents: dbacd99 Author: Shuai Lin Authored: Tue May 3 18:02:12 2016 -0700 Committer: Andrew Or Committed: Tue May 3 18:02:12 2016 -0700 -- docs/mllib-data-types.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c4e0fde8/docs/mllib-data-types.md -- diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index 5e3ee47..2ffe0f1 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -314,12 +314,12 @@ matrices. Remember, local matrices in MLlib are stored in column-major order. Refer to the [`Matrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix) and [`Matrices` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrices) for more details on the API. 
{% highlight python %} -import org.apache.spark.mllib.linalg.{Matrix, Matrices} +from pyspark.mllib.linalg import Matrix, Matrices -// Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) +# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) -// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) +# Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8]) {% endhighlight %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14414][SQL] Make DDL exceptions more consistent
Repository: spark Updated Branches: refs/heads/master 9e4928b7e -> 6ba17cd14 [SPARK-14414][SQL] Make DDL exceptions more consistent ## What changes were proposed in this pull request? Just a bunch of small tweaks on DDL exception messages. ## How was this patch tested? `DDLCommandSuite` et al. Author: Andrew Or <and...@databricks.com> Closes #12853 from andrewor14/make-exceptions-consistent. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ba17cd1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ba17cd1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ba17cd1 Branch: refs/heads/master Commit: 6ba17cd147277a20a7fbb244c040e694de486c36 Parents: 9e4928b Author: Andrew Or <and...@databricks.com> Authored: Tue May 3 18:07:53 2016 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Tue May 3 18:07:53 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 26 +-- .../catalyst/analysis/NoSuchItemException.scala | 14 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 10 +- .../sql/catalyst/catalog/SessionCatalog.scala | 18 +- .../spark/sql/catalyst/parser/ParserUtils.scala | 2 +- .../catalyst/catalog/SessionCatalogSuite.scala | 6 +- .../spark/sql/execution/SparkSqlParser.scala| 152 +++- .../sql/execution/command/AnalyzeTable.scala| 6 +- .../spark/sql/execution/command/ddl.scala | 70 +--- .../spark/sql/execution/command/functions.scala | 10 +- .../spark/sql/execution/command/tables.scala| 39 ++--- .../spark/sql/execution/command/views.scala | 3 +- .../sql/execution/command/DDLCommandSuite.scala | 174 +++ .../sql/sources/CreateTableAsSelectSuite.scala | 13 +- .../hive/execution/HiveCompatibilitySuite.scala | 10 +- .../spark/sql/hive/HiveExternalCatalog.scala| 2 +- .../spark/sql/hive/client/HiveClientImpl.scala | 12 +- .../sql/hive/execution/HiveCommandSuite.scala | 4 +- .../sql/hive/execution/SQLQuerySuite.scala | 2 +- .../spark/sql/hive/execution/SQLViewSuite.scala | 3 +- 
20 files changed, 141 insertions(+), 435 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6ba17cd1/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 3ab448d..273ad92 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -81,18 +81,8 @@ statement DROP (IF EXISTS)? partitionSpec (',' partitionSpec)* PURGE? #dropTablePartitions | ALTER VIEW tableIdentifier DROP (IF EXISTS)? partitionSpec (',' partitionSpec)* #dropTablePartitions -| ALTER TABLE tableIdentifier partitionSpec? -SET FILEFORMAT fileFormat #setTableFileFormat | ALTER TABLE tableIdentifier partitionSpec? SET locationSpec #setTableLocation -| ALTER TABLE tableIdentifier partitionSpec? -CHANGE COLUMN? oldName=identifier colType -(FIRST | AFTER after=identifier)? (CASCADE | RESTRICT)? #changeColumn -| ALTER TABLE tableIdentifier partitionSpec? -ADD COLUMNS '(' colTypeList ')' (CASCADE | RESTRICT)? #addColumns -| ALTER TABLE tableIdentifier partitionSpec? -REPLACE COLUMNS '(' colTypeList ')' (CASCADE | RESTRICT)? #replaceColumns -| DROP TABLE (IF EXISTS)? tableIdentifier PURGE? -(FOR METADATA? REPLICATION '(' STRING ')')? #dropTable +| DROP TABLE (IF EXISTS)? tableIdentifier PURGE? #dropTable | DROP VIEW (IF EXISTS)? tableIdentifier #dropTable | CREATE (OR REPLACE)? VIEW (IF NOT EXISTS)? tableIdentifier identifierCommentList? (COMMENT STRING)? @@ -170,6 +160,10 @@ unsupportedHiveNativeCommands | kw1=ALTER kw2=TABLE tableIdentifier kw3=TOUCH | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=COMPACT | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CONCATENATE +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? 
kw3=SET kw4=FILEFORMAT +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=ADD kw4=COLUMNS +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CHANGE kw4=COLUMNS? +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=REPLACE kw4=COLUMNS | kw1=START kw2=TRANSACTION | kw1=COMMIT | kw1=ROLLBACK @@ -647,9 +641,9 @@ nonReserved | NO | DATA
spark git commit: [SPARK-14414][SQL] Make DDL exceptions more consistent
Repository: spark Updated Branches: refs/heads/branch-2.0 95d359abd -> 940b8f60b [SPARK-14414][SQL] Make DDL exceptions more consistent ## What changes were proposed in this pull request? Just a bunch of small tweaks on DDL exception messages. ## How was this patch tested? `DDLCommandSuite` et al. Author: Andrew Or <and...@databricks.com> Closes #12853 from andrewor14/make-exceptions-consistent. (cherry picked from commit 6ba17cd147277a20a7fbb244c040e694de486c36) Signed-off-by: Andrew Or <and...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/940b8f60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/940b8f60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/940b8f60 Branch: refs/heads/branch-2.0 Commit: 940b8f60b90d0acf6910abfd368af25cefdf4ffa Parents: 95d359a Author: Andrew Or <and...@databricks.com> Authored: Tue May 3 18:07:53 2016 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Tue May 3 18:08:03 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 26 +-- .../catalyst/analysis/NoSuchItemException.scala | 14 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 10 +- .../sql/catalyst/catalog/SessionCatalog.scala | 18 +- .../spark/sql/catalyst/parser/ParserUtils.scala | 2 +- .../catalyst/catalog/SessionCatalogSuite.scala | 6 +- .../spark/sql/execution/SparkSqlParser.scala| 152 +++- .../sql/execution/command/AnalyzeTable.scala| 6 +- .../spark/sql/execution/command/ddl.scala | 70 +--- .../spark/sql/execution/command/functions.scala | 10 +- .../spark/sql/execution/command/tables.scala| 39 ++--- .../spark/sql/execution/command/views.scala | 3 +- .../sql/execution/command/DDLCommandSuite.scala | 174 +++ .../sql/sources/CreateTableAsSelectSuite.scala | 13 +- .../hive/execution/HiveCompatibilitySuite.scala | 10 +- .../spark/sql/hive/HiveExternalCatalog.scala| 2 +- .../spark/sql/hive/client/HiveClientImpl.scala | 12 +- 
.../sql/hive/execution/HiveCommandSuite.scala | 4 +- .../sql/hive/execution/SQLQuerySuite.scala | 2 +- .../spark/sql/hive/execution/SQLViewSuite.scala | 3 +- 20 files changed, 141 insertions(+), 435 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/940b8f60/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 3ab448d..273ad92 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -81,18 +81,8 @@ statement DROP (IF EXISTS)? partitionSpec (',' partitionSpec)* PURGE? #dropTablePartitions | ALTER VIEW tableIdentifier DROP (IF EXISTS)? partitionSpec (',' partitionSpec)* #dropTablePartitions -| ALTER TABLE tableIdentifier partitionSpec? -SET FILEFORMAT fileFormat #setTableFileFormat | ALTER TABLE tableIdentifier partitionSpec? SET locationSpec #setTableLocation -| ALTER TABLE tableIdentifier partitionSpec? -CHANGE COLUMN? oldName=identifier colType -(FIRST | AFTER after=identifier)? (CASCADE | RESTRICT)? #changeColumn -| ALTER TABLE tableIdentifier partitionSpec? -ADD COLUMNS '(' colTypeList ')' (CASCADE | RESTRICT)? #addColumns -| ALTER TABLE tableIdentifier partitionSpec? -REPLACE COLUMNS '(' colTypeList ')' (CASCADE | RESTRICT)? #replaceColumns -| DROP TABLE (IF EXISTS)? tableIdentifier PURGE? -(FOR METADATA? REPLICATION '(' STRING ')')? #dropTable +| DROP TABLE (IF EXISTS)? tableIdentifier PURGE? #dropTable | DROP VIEW (IF EXISTS)? tableIdentifier #dropTable | CREATE (OR REPLACE)? VIEW (IF NOT EXISTS)? tableIdentifier identifierCommentList? (COMMENT STRING)? @@ -170,6 +160,10 @@ unsupportedHiveNativeCommands | kw1=ALTER kw2=TABLE tableIdentifier kw3=TOUCH | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? 
kw3=COMPACT | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CONCATENATE +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=SET kw4=FILEFORMAT +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=ADD kw4=COLUMNS +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CHANGE kw4=COLUMNS? +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=REPLACE kw4=C
spark git commit: [MINOR][BUILD] Adds spark-warehouse/ to .gitignore
Repository: spark Updated Branches: refs/heads/branch-2.0 c2b100e50 -> b063d9b71 [MINOR][BUILD] Adds spark-warehouse/ to .gitignore ## What changes were proposed in this pull request? Adds spark-warehouse/ to `.gitignore`. ## How was this patch tested? N/A Author: Cheng Lian. Closes #12929 from liancheng/gitignore-spark-warehouse. (cherry picked from commit 63db2bd283a430971d85f2a7b06dac77723c56fa) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b063d9b7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b063d9b7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b063d9b7 Branch: refs/heads/branch-2.0 Commit: b063d9b713056e7b9ae3e048e56d41a4804d520f Parents: c2b100e Author: Cheng Lian Authored: Thu May 5 14:33:14 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:33:25 2016 -0700 -- .gitignore | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b063d9b7/.gitignore -- diff --git a/.gitignore b/.gitignore index 6e09e09..9f8cd0b 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ metastore/ metastore_db/ sql/hive-thriftserver/test_warehouses warehouse/ +spark-warehouse/ # For R session data .RData - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][BUILD] Adds spark-warehouse/ to .gitignore
Repository: spark Updated Branches: refs/heads/master 6fcc9 -> 63db2bd28 [MINOR][BUILD] Adds spark-warehouse/ to .gitignore ## What changes were proposed in this pull request? Adds spark-warehouse/ to `.gitignore`. ## How was this patch tested? N/A Author: Cheng Lian. Closes #12929 from liancheng/gitignore-spark-warehouse. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63db2bd2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63db2bd2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63db2bd2 Branch: refs/heads/master Commit: 63db2bd283a430971d85f2a7b06dac77723c56fa Parents: 6fc Author: Cheng Lian Authored: Thu May 5 14:33:14 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:33:14 2016 -0700 -- .gitignore | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/63db2bd2/.gitignore -- diff --git a/.gitignore b/.gitignore index 6e09e09..9f8cd0b 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ metastore/ metastore_db/ sql/hive-thriftserver/test_warehouses warehouse/ +spark-warehouse/ # For R session data .RData - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15135][SQL] Make sure SparkSession thread safe
Repository: spark Updated Branches: refs/heads/branch-2.0 59fa480b6 -> e78b31b72 [SPARK-15135][SQL] Make sure SparkSession thread safe ## What changes were proposed in this pull request? Went through SparkSession and its members and fixed non-thread-safe classes used by SparkSession ## How was this patch tested? Existing unit tests Author: Shixiong ZhuCloses #12915 from zsxwing/spark-session-thread-safe. (cherry picked from commit bb9991dec5dd631b22a05e2e1b83b9082a845e8f) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e78b31b7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e78b31b7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e78b31b7 Branch: refs/heads/branch-2.0 Commit: e78b31b72d85ffcc596176a2c91b050f3a3ff3ee Parents: 59fa480 Author: Shixiong Zhu Authored: Thu May 5 14:36:47 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:36:57 2016 -0700 -- .../catalyst/analysis/FunctionRegistry.scala| 10 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 2 +- .../sql/catalyst/catalog/SessionCatalog.scala | 102 ++- .../apache/spark/sql/ExperimentalMethods.scala | 4 +- .../org/apache/spark/sql/SparkSession.scala | 7 +- .../apache/spark/sql/hive/test/TestHive.scala | 4 +- 6 files changed, 73 insertions(+), 56 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e78b31b7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 1bada2c..ac05dd3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -28,7 +28,11 @@ import 
org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.util.StringKeyHashMap -/** A catalog for looking up user defined functions, used by an [[Analyzer]]. */ +/** + * A catalog for looking up user defined functions, used by an [[Analyzer]]. + * + * Note: The implementation should be thread-safe to allow concurrent access. + */ trait FunctionRegistry { final def registerFunction(name: String, builder: FunctionBuilder): Unit = { @@ -62,7 +66,7 @@ trait FunctionRegistry { class SimpleFunctionRegistry extends FunctionRegistry { - private[sql] val functionBuilders = + protected val functionBuilders = StringKeyHashMap[(ExpressionInfo, FunctionBuilder)](caseSensitive = false) override def registerFunction( @@ -97,7 +101,7 @@ class SimpleFunctionRegistry extends FunctionRegistry { functionBuilders.remove(name).isDefined } - override def clear(): Unit = { + override def clear(): Unit = synchronized { functionBuilders.clear() } http://git-wip-us.apache.org/repos/asf/spark/blob/e78b31b7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 1d2ca28..c65f461 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -340,7 +340,7 @@ class InMemoryCatalog extends ExternalCatalog { catalog(db).functions(funcName) } - override def functionExists(db: String, funcName: String): Boolean = { + override def functionExists(db: String, funcName: String): Boolean = synchronized { requireDbExists(db) catalog(db).functions.contains(funcName) } http://git-wip-us.apache.org/repos/asf/spark/blob/e78b31b7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala 
-- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index eff420e..7127707 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -17,6 +17,8 @@ package
spark git commit: [SPARK-15135][SQL] Make sure SparkSession thread safe
Repository: spark Updated Branches: refs/heads/master ed6f3f8a5 -> bb9991dec [SPARK-15135][SQL] Make sure SparkSession thread safe ## What changes were proposed in this pull request? Went through SparkSession and its members and fixed non-thread-safe classes used by SparkSession ## How was this patch tested? Existing unit tests Author: Shixiong ZhuCloses #12915 from zsxwing/spark-session-thread-safe. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bb9991de Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bb9991de Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bb9991de Branch: refs/heads/master Commit: bb9991dec5dd631b22a05e2e1b83b9082a845e8f Parents: ed6f3f8 Author: Shixiong Zhu Authored: Thu May 5 14:36:47 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:36:47 2016 -0700 -- .../catalyst/analysis/FunctionRegistry.scala| 10 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 2 +- .../sql/catalyst/catalog/SessionCatalog.scala | 102 ++- .../apache/spark/sql/ExperimentalMethods.scala | 4 +- .../org/apache/spark/sql/SparkSession.scala | 7 +- .../apache/spark/sql/hive/test/TestHive.scala | 4 +- 6 files changed, 73 insertions(+), 56 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bb9991de/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 1bada2c..ac05dd3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -28,7 +28,11 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.util.StringKeyHashMap -/** A catalog for looking up user 
defined functions, used by an [[Analyzer]]. */ +/** + * A catalog for looking up user defined functions, used by an [[Analyzer]]. + * + * Note: The implementation should be thread-safe to allow concurrent access. + */ trait FunctionRegistry { final def registerFunction(name: String, builder: FunctionBuilder): Unit = { @@ -62,7 +66,7 @@ trait FunctionRegistry { class SimpleFunctionRegistry extends FunctionRegistry { - private[sql] val functionBuilders = + protected val functionBuilders = StringKeyHashMap[(ExpressionInfo, FunctionBuilder)](caseSensitive = false) override def registerFunction( @@ -97,7 +101,7 @@ class SimpleFunctionRegistry extends FunctionRegistry { functionBuilders.remove(name).isDefined } - override def clear(): Unit = { + override def clear(): Unit = synchronized { functionBuilders.clear() } http://git-wip-us.apache.org/repos/asf/spark/blob/bb9991de/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 1d2ca28..c65f461 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -340,7 +340,7 @@ class InMemoryCatalog extends ExternalCatalog { catalog(db).functions(funcName) } - override def functionExists(db: String, funcName: String): Boolean = { + override def functionExists(db: String, funcName: String): Boolean = synchronized { requireDbExists(db) catalog(db).functions.contains(funcName) } http://git-wip-us.apache.org/repos/asf/spark/blob/bb9991de/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index eff420e..7127707 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.catalog +import javax.annotation.concurrent.GuardedBy + import scala.collection.mutable import
[1/2] spark git commit: [SPARK-15134][EXAMPLE] Indent SparkSession builder patterns and update binary_classification_metrics_example.py
Repository: spark Updated Branches: refs/heads/master bb9991dec -> 2c170dd3d http://git-wip-us.apache.org/repos/asf/spark/blob/2c170dd3/examples/src/main/python/ml/vector_indexer_example.py -- diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py index 3cf5b8e..9b00e0f 100644 --- a/examples/src/main/python/ml/vector_indexer_example.py +++ b/examples/src/main/python/ml/vector_indexer_example.py @@ -23,7 +23,10 @@ from pyspark.ml.feature import VectorIndexer from pyspark.sql import SparkSession if __name__ == "__main__": -spark = SparkSession.builder.appName("VectorIndexerExample").getOrCreate() +spark = SparkSession\ +.builder\ +.appName("VectorIndexerExample")\ +.getOrCreate() # $example on$ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") http://git-wip-us.apache.org/repos/asf/spark/blob/2c170dd3/examples/src/main/python/ml/vector_slicer_example.py -- diff --git a/examples/src/main/python/ml/vector_slicer_example.py b/examples/src/main/python/ml/vector_slicer_example.py index 0531bcd..b833a89 100644 --- a/examples/src/main/python/ml/vector_slicer_example.py +++ b/examples/src/main/python/ml/vector_slicer_example.py @@ -25,7 +25,10 @@ from pyspark.sql.types import Row from pyspark.sql import SparkSession if __name__ == "__main__": -spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate() +spark = SparkSession\ +.builder\ +.appName("VectorSlicerExample")\ +.getOrCreate() # $example on$ df = spark.createDataFrame([ http://git-wip-us.apache.org/repos/asf/spark/blob/2c170dd3/examples/src/main/python/ml/word2vec_example.py -- diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py index 6766a7b..66500be 100644 --- a/examples/src/main/python/ml/word2vec_example.py +++ b/examples/src/main/python/ml/word2vec_example.py @@ -23,7 +23,10 @@ from pyspark.ml.feature import Word2Vec from pyspark.sql import 
SparkSession if __name__ == "__main__": -spark = SparkSession.builder.appName("Word2VecExample").getOrCreate() +spark = SparkSession\ +.builder\ +.appName("Word2VecExample")\ +.getOrCreate() # $example on$ # Input data: Each row is a bag of words from a sentence or document. http://git-wip-us.apache.org/repos/asf/spark/blob/2c170dd3/examples/src/main/python/mllib/binary_classification_metrics_example.py -- diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py index 8f0fc9d4..daf000e 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics_example.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -18,20 +18,25 @@ Binary Classification Metrics Example. """ from __future__ import print_function -from pyspark import SparkContext +from pyspark.sql import SparkSession # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.evaluation import BinaryClassificationMetrics -from pyspark.mllib.util import MLUtils +from pyspark.mllib.regression import LabeledPoint # $example off$ if __name__ == "__main__": -sc = SparkContext(appName="BinaryClassificationMetricsExample") +spark = SparkSession\ +.builder\ +.appName("BinaryClassificationMetricsExample")\ +.getOrCreate() # $example on$ # Several of the methods available in scala are currently missing from pyspark # Load training data in LIBSVM format -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") +data = spark\ + .read.format("libsvm").load("data/mllib/sample_binary_classification_data.txt")\ +.rdd.map(lambda row: LabeledPoint(row[0], row[1])) # Split data into training (60%) and test (40%) training, test = data.randomSplit([0.6, 0.4], seed=11L) @@ -53,4 +58,4 @@ if __name__ == "__main__": print("Area under ROC = %s" % metrics.areaUnderROC) # $example off$ -sc.stop() +spark.stop() 
http://git-wip-us.apache.org/repos/asf/spark/blob/2c170dd3/examples/src/main/python/sql.py -- diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py index 59a46cb..5594223 100644 --- a/examples/src/main/python/sql.py +++ b/examples/src/main/python/sql.py @@ -25,7 +25,10 @@ from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerT if __name__ ==
[1/2] spark git commit: [SPARK-15134][EXAMPLE] Indent SparkSession builder patterns and update binary_classification_metrics_example.py
Repository: spark Updated Branches: refs/heads/branch-2.0 e78b31b72 -> 8b4ab590c http://git-wip-us.apache.org/repos/asf/spark/blob/8b4ab590/examples/src/main/python/ml/vector_indexer_example.py -- diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py index 3cf5b8e..9b00e0f 100644 --- a/examples/src/main/python/ml/vector_indexer_example.py +++ b/examples/src/main/python/ml/vector_indexer_example.py @@ -23,7 +23,10 @@ from pyspark.ml.feature import VectorIndexer from pyspark.sql import SparkSession if __name__ == "__main__": -spark = SparkSession.builder.appName("VectorIndexerExample").getOrCreate() +spark = SparkSession\ +.builder\ +.appName("VectorIndexerExample")\ +.getOrCreate() # $example on$ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") http://git-wip-us.apache.org/repos/asf/spark/blob/8b4ab590/examples/src/main/python/ml/vector_slicer_example.py -- diff --git a/examples/src/main/python/ml/vector_slicer_example.py b/examples/src/main/python/ml/vector_slicer_example.py index 0531bcd..b833a89 100644 --- a/examples/src/main/python/ml/vector_slicer_example.py +++ b/examples/src/main/python/ml/vector_slicer_example.py @@ -25,7 +25,10 @@ from pyspark.sql.types import Row from pyspark.sql import SparkSession if __name__ == "__main__": -spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate() +spark = SparkSession\ +.builder\ +.appName("VectorSlicerExample")\ +.getOrCreate() # $example on$ df = spark.createDataFrame([ http://git-wip-us.apache.org/repos/asf/spark/blob/8b4ab590/examples/src/main/python/ml/word2vec_example.py -- diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py index 6766a7b..66500be 100644 --- a/examples/src/main/python/ml/word2vec_example.py +++ b/examples/src/main/python/ml/word2vec_example.py @@ -23,7 +23,10 @@ from pyspark.ml.feature import Word2Vec from pyspark.sql import 
SparkSession if __name__ == "__main__": -spark = SparkSession.builder.appName("Word2VecExample").getOrCreate() +spark = SparkSession\ +.builder\ +.appName("Word2VecExample")\ +.getOrCreate() # $example on$ # Input data: Each row is a bag of words from a sentence or document. http://git-wip-us.apache.org/repos/asf/spark/blob/8b4ab590/examples/src/main/python/mllib/binary_classification_metrics_example.py -- diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py index 8f0fc9d4..daf000e 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics_example.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -18,20 +18,25 @@ Binary Classification Metrics Example. """ from __future__ import print_function -from pyspark import SparkContext +from pyspark.sql import SparkSession # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.evaluation import BinaryClassificationMetrics -from pyspark.mllib.util import MLUtils +from pyspark.mllib.regression import LabeledPoint # $example off$ if __name__ == "__main__": -sc = SparkContext(appName="BinaryClassificationMetricsExample") +spark = SparkSession\ +.builder\ +.appName("BinaryClassificationMetricsExample")\ +.getOrCreate() # $example on$ # Several of the methods available in scala are currently missing from pyspark # Load training data in LIBSVM format -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") +data = spark\ + .read.format("libsvm").load("data/mllib/sample_binary_classification_data.txt")\ +.rdd.map(lambda row: LabeledPoint(row[0], row[1])) # Split data into training (60%) and test (40%) training, test = data.randomSplit([0.6, 0.4], seed=11L) @@ -53,4 +58,4 @@ if __name__ == "__main__": print("Area under ROC = %s" % metrics.areaUnderROC) # $example off$ -sc.stop() +spark.stop() 
http://git-wip-us.apache.org/repos/asf/spark/blob/8b4ab590/examples/src/main/python/sql.py -- diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py index 59a46cb..5594223 100644 --- a/examples/src/main/python/sql.py +++ b/examples/src/main/python/sql.py @@ -25,7 +25,10 @@ from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerT if __name__ ==
[2/2] spark git commit: [SPARK-15134][EXAMPLE] Indent SparkSession builder patterns and update binary_classification_metrics_example.py
[SPARK-15134][EXAMPLE] Indent SparkSession builder patterns and update binary_classification_metrics_example.py ## What changes were proposed in this pull request? This issue addresses the comments in SPARK-15031 and also fix java-linter errors. - Use multiline format in SparkSession builder patterns. - Update `binary_classification_metrics_example.py` to use `SparkSession`. - Fix Java Linter errors (in SPARK-13745, SPARK-15031, and so far) ## How was this patch tested? After passing the Jenkins tests and run `dev/lint-java` manually. Author: Dongjoon HyunCloses #12911 from dongjoon-hyun/SPARK-15134. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2c170dd3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2c170dd3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2c170dd3 Branch: refs/heads/master Commit: 2c170dd3d731bd848d62265431795e1c141d75d7 Parents: bb9991d Author: Dongjoon Hyun Authored: Thu May 5 14:37:50 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:37:50 2016 -0700 -- .../network/shuffle/ExternalShuffleBlockHandler.java | 7 +-- .../ml/JavaAFTSurvivalRegressionExample.java | 5 - .../org/apache/spark/examples/ml/JavaALSExample.java | 5 - .../spark/examples/ml/JavaBinarizerExample.java | 8 .../examples/ml/JavaBisectingKMeansExample.java | 5 - .../spark/examples/ml/JavaBucketizerExample.java | 5 - .../spark/examples/ml/JavaChiSqSelectorExample.java | 8 .../examples/ml/JavaCountVectorizerExample.java | 5 - .../org/apache/spark/examples/ml/JavaDCTExample.java | 8 .../spark/examples/ml/JavaDeveloperApiExample.java | 5 - .../examples/ml/JavaElementwiseProductExample.java | 7 +++ .../ml/JavaGradientBoostedTreeClassifierExample.java | 10 ++ .../spark/examples/ml/JavaIndexToStringExample.java | 5 - .../apache/spark/examples/ml/JavaKMeansExample.java | 5 - .../org/apache/spark/examples/ml/JavaLDAExample.java | 5 - .../spark/examples/ml/JavaMaxAbsScalerExample.java | 14 ++ 
.../spark/examples/ml/JavaMinMaxScalerExample.java | 10 -- .../apache/spark/examples/ml/JavaNGramExample.java | 5 - .../spark/examples/ml/JavaNaiveBayesExample.java | 5 - .../spark/examples/ml/JavaNormalizerExample.java | 5 - .../spark/examples/ml/JavaOneHotEncoderExample.java | 5 - .../spark/examples/ml/JavaOneVsRestExample.java | 5 - .../org/apache/spark/examples/ml/JavaPCAExample.java | 5 - .../spark/examples/ml/JavaPipelineExample.java | 5 - .../examples/ml/JavaPolynomialExpansionExample.java | 5 - .../spark/examples/ml/JavaRFormulaExample.java | 5 - .../spark/examples/ml/JavaSQLTransformerExample.java | 5 - .../spark/examples/ml/JavaSimpleParamsExample.java | 5 - .../spark/examples/ml/JavaStandardScalerExample.java | 5 - .../examples/ml/JavaStopWordsRemoverExample.java | 5 - .../spark/examples/ml/JavaStringIndexerExample.java | 5 - .../apache/spark/examples/ml/JavaTfIdfExample.java | 5 - .../spark/examples/ml/JavaTokenizerExample.java | 5 - .../examples/ml/JavaVectorAssemblerExample.java | 5 - .../spark/examples/ml/JavaVectorIndexerExample.java | 5 - .../spark/examples/ml/JavaVectorSlicerExample.java | 5 - .../spark/examples/ml/JavaWord2VecExample.java | 5 - .../org/apache/spark/examples/sql/JavaSparkSQL.java | 8 ++-- .../examples/streaming/JavaSqlNetworkWordCount.java | 5 - examples/src/main/python/ml/als_example.py | 5 - examples/src/main/python/ml/binarizer_example.py | 5 - .../src/main/python/ml/bisecting_k_means_example.py | 5 - examples/src/main/python/ml/bucketizer_example.py| 5 - .../src/main/python/ml/chisq_selector_example.py | 5 - .../src/main/python/ml/count_vectorizer_example.py | 5 - examples/src/main/python/ml/cross_validator.py | 5 - examples/src/main/python/ml/dataframe_example.py | 5 - examples/src/main/python/ml/dct_example.py | 5 - .../ml/decision_tree_classification_example.py | 5 - .../python/ml/decision_tree_regression_example.py| 5 - .../main/python/ml/elementwise_product_example.py| 5 - 
.../python/ml/estimator_transformer_param_example.py | 5 - .../ml/gradient_boosted_tree_classifier_example.py | 5 - .../ml/gradient_boosted_tree_regressor_example.py| 5 - .../src/main/python/ml/index_to_string_example.py| 5 - examples/src/main/python/ml/kmeans_example.py| 5 - .../python/ml/linear_regression_with_elastic_net.py | 5 -
[2/2] spark git commit: [SPARK-15134][EXAMPLE] Indent SparkSession builder patterns and update binary_classification_metrics_example.py
[SPARK-15134][EXAMPLE] Indent SparkSession builder patterns and update binary_classification_metrics_example.py ## What changes were proposed in this pull request? This issue addresses the comments in SPARK-15031 and also fix java-linter errors. - Use multiline format in SparkSession builder patterns. - Update `binary_classification_metrics_example.py` to use `SparkSession`. - Fix Java Linter errors (in SPARK-13745, SPARK-15031, and so far) ## How was this patch tested? After passing the Jenkins tests and run `dev/lint-java` manually. Author: Dongjoon HyunCloses #12911 from dongjoon-hyun/SPARK-15134. (cherry picked from commit 2c170dd3d731bd848d62265431795e1c141d75d7) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8b4ab590 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8b4ab590 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8b4ab590 Branch: refs/heads/branch-2.0 Commit: 8b4ab590cb18b926c71c4cb4ec5b184b1b566770 Parents: e78b31b Author: Dongjoon Hyun Authored: Thu May 5 14:37:50 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:38:02 2016 -0700 -- .../network/shuffle/ExternalShuffleBlockHandler.java | 7 +-- .../ml/JavaAFTSurvivalRegressionExample.java | 5 - .../org/apache/spark/examples/ml/JavaALSExample.java | 5 - .../spark/examples/ml/JavaBinarizerExample.java | 8 .../examples/ml/JavaBisectingKMeansExample.java | 5 - .../spark/examples/ml/JavaBucketizerExample.java | 5 - .../spark/examples/ml/JavaChiSqSelectorExample.java | 8 .../examples/ml/JavaCountVectorizerExample.java | 5 - .../org/apache/spark/examples/ml/JavaDCTExample.java | 8 .../spark/examples/ml/JavaDeveloperApiExample.java | 5 - .../examples/ml/JavaElementwiseProductExample.java | 7 +++ .../ml/JavaGradientBoostedTreeClassifierExample.java | 10 ++ .../spark/examples/ml/JavaIndexToStringExample.java | 5 - .../apache/spark/examples/ml/JavaKMeansExample.java | 5 - 
.../org/apache/spark/examples/ml/JavaLDAExample.java | 5 - .../spark/examples/ml/JavaMaxAbsScalerExample.java | 14 ++ .../spark/examples/ml/JavaMinMaxScalerExample.java | 10 -- .../apache/spark/examples/ml/JavaNGramExample.java | 5 - .../spark/examples/ml/JavaNaiveBayesExample.java | 5 - .../spark/examples/ml/JavaNormalizerExample.java | 5 - .../spark/examples/ml/JavaOneHotEncoderExample.java | 5 - .../spark/examples/ml/JavaOneVsRestExample.java | 5 - .../org/apache/spark/examples/ml/JavaPCAExample.java | 5 - .../spark/examples/ml/JavaPipelineExample.java | 5 - .../examples/ml/JavaPolynomialExpansionExample.java | 5 - .../spark/examples/ml/JavaRFormulaExample.java | 5 - .../spark/examples/ml/JavaSQLTransformerExample.java | 5 - .../spark/examples/ml/JavaSimpleParamsExample.java | 5 - .../spark/examples/ml/JavaStandardScalerExample.java | 5 - .../examples/ml/JavaStopWordsRemoverExample.java | 5 - .../spark/examples/ml/JavaStringIndexerExample.java | 5 - .../apache/spark/examples/ml/JavaTfIdfExample.java | 5 - .../spark/examples/ml/JavaTokenizerExample.java | 5 - .../examples/ml/JavaVectorAssemblerExample.java | 5 - .../spark/examples/ml/JavaVectorIndexerExample.java | 5 - .../spark/examples/ml/JavaVectorSlicerExample.java | 5 - .../spark/examples/ml/JavaWord2VecExample.java | 5 - .../org/apache/spark/examples/sql/JavaSparkSQL.java | 8 ++-- .../examples/streaming/JavaSqlNetworkWordCount.java | 5 - examples/src/main/python/ml/als_example.py | 5 - examples/src/main/python/ml/binarizer_example.py | 5 - .../src/main/python/ml/bisecting_k_means_example.py | 5 - examples/src/main/python/ml/bucketizer_example.py| 5 - .../src/main/python/ml/chisq_selector_example.py | 5 - .../src/main/python/ml/count_vectorizer_example.py | 5 - examples/src/main/python/ml/cross_validator.py | 5 - examples/src/main/python/ml/dataframe_example.py | 5 - examples/src/main/python/ml/dct_example.py | 5 - .../ml/decision_tree_classification_example.py | 5 - 
.../python/ml/decision_tree_regression_example.py| 5 - .../main/python/ml/elementwise_product_example.py| 5 - .../python/ml/estimator_transformer_param_example.py | 5 - .../ml/gradient_boosted_tree_classifier_example.py | 5 - .../ml/gradient_boosted_tree_regressor_example.py| 5 - .../src/main/python/ml/index_to_string_example.py| 5 -
spark git commit: [SPARK-15152][DOC][MINOR] Scaladoc and Code style Improvements
Repository: spark Updated Branches: refs/heads/branch-2.0 1064a3303 -> a1887f213 [SPARK-15152][DOC][MINOR] Scaladoc and Code style Improvements ## What changes were proposed in this pull request? Minor doc and code style fixes ## How was this patch tested? local build Author: Jacek LaskowskiCloses #12928 from jaceklaskowski/SPARK-15152. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a1887f21 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a1887f21 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a1887f21 Branch: refs/heads/branch-2.0 Commit: a1887f2139dbcbd356bff8d7530769a1d1e7a9b5 Parents: 1064a33 Author: Jacek Laskowski Authored: Thu May 5 16:34:27 2016 -0700 Committer: Andrew Or Committed: Thu May 5 16:35:49 2016 -0700 -- .../scala/org/apache/spark/Accumulator.scala| 9 ++-- .../scala/org/apache/spark/scheduler/Pool.scala | 9 ++-- .../spark/scheduler/SchedulingAlgorithm.scala | 13 ++ .../apache/spark/util/ShutdownHookManager.scala | 4 +- .../org/apache/spark/ml/feature/Binarizer.scala | 6 +-- .../org/apache/spark/mllib/util/MLUtils.scala | 5 +- .../sql/catalyst/planning/QueryPlanner.scala| 9 ++-- .../apache/spark/sql/execution/ExpandExec.scala | 2 +- .../apache/spark/sql/execution/SparkPlan.scala | 12 +++-- .../sql/execution/WholeStageCodegenExec.scala | 6 +-- .../apache/spark/sql/execution/objects.scala| 2 +- .../execution/streaming/FileStreamSource.scala | 2 +- .../streaming/IncrementalExecution.scala| 2 +- .../org/apache/spark/streaming/Checkpoint.scala | 49 ++-- .../org/apache/spark/deploy/yarn/Client.scala | 4 +- 15 files changed, 66 insertions(+), 68 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a1887f21/core/src/main/scala/org/apache/spark/Accumulator.scala -- diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala index 2324504..9d1f1d5 100644 --- 
a/core/src/main/scala/org/apache/spark/Accumulator.scala +++ b/core/src/main/scala/org/apache/spark/Accumulator.scala @@ -24,16 +24,17 @@ package org.apache.spark * They can be used to implement counters (as in MapReduce) or sums. Spark natively supports * accumulators of numeric value types, and programmers can add support for new types. * - * An accumulator is created from an initial value `v` by calling [[SparkContext#accumulator]]. - * Tasks running on the cluster can then add to it using the [[Accumulable#+=]] operator. + * An accumulator is created from an initial value `v` by calling + * [[SparkContext#accumulator SparkContext.accumulator]]. + * Tasks running on the cluster can then add to it using the [[Accumulable#+= +=]] operator. * However, they cannot read its value. Only the driver program can read the accumulator's value, - * using its value method. + * using its [[#value]] method. * * The interpreter session below shows an accumulator being used to add up the elements of an array: * * {{{ * scala> val accum = sc.accumulator(0) - * accum: spark.Accumulator[Int] = 0 + * accum: org.apache.spark.Accumulator[Int] = 0 * * scala> sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum += x) * ... 
http://git-wip-us.apache.org/repos/asf/spark/blob/a1887f21/core/src/main/scala/org/apache/spark/scheduler/Pool.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index a79e71e..5987cfe 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -26,16 +26,14 @@ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.SchedulingMode.SchedulingMode /** - * An Schedulable entity that represent collection of Pools or TaskSetManagers + * An Schedulable entity that represents collection of Pools or TaskSetManagers */ - private[spark] class Pool( val poolName: String, val schedulingMode: SchedulingMode, initMinShare: Int, initWeight: Int) - extends Schedulable - with Logging { + extends Schedulable with Logging { val schedulableQueue = new ConcurrentLinkedQueue[Schedulable] val schedulableNameToSchedulable = new ConcurrentHashMap[String, Schedulable] @@ -56,7 +54,8 @@ private[spark] class Pool( case SchedulingMode.FIFO => new FIFOSchedulingAlgorithm() case _ => -throw new IllegalArgumentException(s"Unsupported spark.scheduler.mode: $schedulingMode") +val msg = "Unsupported
spark git commit: [SPARK-14124][SQL][FOLLOWUP] Implement Database-related DDL Commands
Repository: spark Updated Branches: refs/heads/branch-2.0 b063d9b71 -> fe268ee1e [SPARK-14124][SQL][FOLLOWUP] Implement Database-related DDL Commands What changes were proposed in this pull request? First, a few test cases failed in mac OS X because the property value of `java.io.tmpdir` does not include a trailing slash on some platform. Hive always removes the last trailing slash. For example, what I got in the web: ``` Win NT --> C:\TEMP\ Win XP --> C:\TEMP Solaris --> /var/tmp/ Linux --> /var/tmp ``` Second, a couple of test cases are added to verify if the commands work properly. How was this patch tested? Added a test case for it and correct the previous test cases. Author: gatorsmileAuthor: xiaoli Author: Xiao Li Closes #12081 from gatorsmile/mkdir. (cherry picked from commit 8cba57a75cf9e29b54d97366a039a97a2f305d5d) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe268ee1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe268ee1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe268ee1 Branch: refs/heads/branch-2.0 Commit: fe268ee1e4698ac15fa4014556f4d7d4e1d9f349 Parents: b063d9b Author: gatorsmile Authored: Thu May 5 14:34:24 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:34:34 2016 -0700 -- .../sql/catalyst/catalog/SessionCatalog.scala | 4 + .../spark/sql/execution/command/ddl.scala | 5 +- .../spark/sql/execution/command/DDLSuite.scala | 249 --- .../spark/sql/hive/execution/HiveDDLSuite.scala | 150 ++- 4 files changed, 311 insertions(+), 97 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fe268ee1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index ff63034..eff420e 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -146,6 +146,10 @@ class SessionCatalog( currentDb = db } + /** + * Get the path for creating a non-default database when database location is not provided + * by users. + */ def getDefaultDBPath(db: String): String = { val database = if (conf.caseSensitiveAnalysis) db else db.toLowerCase new Path(new Path(conf.warehousePath), database + ".db").toString http://git-wip-us.apache.org/repos/asf/spark/blob/fe268ee1/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index aa06c01..085bdaf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -40,7 +40,10 @@ import org.apache.spark.sql.types._ * unless 'ifNotExists' is true. 
* The syntax of using this command in SQL is: * {{{ - *CREATE DATABASE|SCHEMA [IF NOT EXISTS] database_name + * CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name + * [COMMENT database_comment] + * [LOCATION database_directory] + * [WITH DBPROPERTIES (property_name=property_value, ...)]; * }}} */ case class CreateDatabase( http://git-wip-us.apache.org/repos/asf/spark/blob/fe268ee1/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 0ae099e..6085098 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -95,49 +95,81 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { catalog.createPartitions(tableName, Seq(part), ignoreIfExists = false) } + private def appendTrailingSlash(path: String): String = { +if (!path.endsWith(File.separator)) path + File.separator else path + } + test("the qualified path of a database is stored in the catalog") { val catalog =
spark git commit: [SPARK-15072][SQL][REPL][EXAMPLES] Remove SparkSession.withHiveSupport
Repository: spark Updated Branches: refs/heads/master 8cba57a75 -> ed6f3f8a5 [SPARK-15072][SQL][REPL][EXAMPLES] Remove SparkSession.withHiveSupport ## What changes were proposed in this pull request? Removing the `withHiveSupport` method of `SparkSession`, instead use `enableHiveSupport` ## How was this patch tested? ran tests locally Author: Sandeep Singh Closes #12851 from techaddict/SPARK-15072. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ed6f3f8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ed6f3f8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ed6f3f8a Branch: refs/heads/master Commit: ed6f3f8a5f3a6bf7c53e13c2798de398c9a526a6 Parents: 8cba57a Author: Sandeep Singh Authored: Thu May 5 14:35:15 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:35:15 2016 -0700 -- .../spark/examples/sql/hive/HiveFromSpark.scala | 14 +- .../scala/org/apache/spark/sql/SparkSession.scala | 13 - .../spark/sql/hive/thriftserver/SparkSQLEnv.scala | 10 ++ .../apache/spark/sql/hive/HiveSparkSubmitSuite.scala | 7 +-- 4 files changed, 20 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ed6f3f8a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala index ff33091..a15cf5d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala @@ -36,15 +36,19 @@ object HiveFromSpark { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("HiveFromSpark") -val sc = new SparkContext(sparkConf) // A hive context adds support for finding tables in the MetaStore and writing queries // using HiveQL. 
Users who do not have an existing Hive deployment can still create a // HiveContext. When not configured by the hive-site.xml, the context automatically // creates metastore_db and warehouse in the current directory. -val sparkSession = SparkSession.withHiveSupport(sc) -import sparkSession.implicits._ -import sparkSession.sql +val spark = SparkSession.builder + .config(sparkConf) + .enableHiveSupport() + .getOrCreate() +val sc = spark.sparkContext + +import spark.implicits._ +import spark.sql sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") sql(s"LOAD DATA LOCAL INPATH '${kv1File.getAbsolutePath}' INTO TABLE src") @@ -74,7 +78,7 @@ object HiveFromSpark { println("Result of SELECT *:") sql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println) -sc.stop() +spark.stop() } } // scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/ed6f3f8a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index aa7c335..9ed3756 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -816,17 +816,4 @@ object SparkSession { } } - /** - * Create a new [[SparkSession]] with a catalog backed by Hive. 
- */ - def withHiveSupport(sc: SparkContext): SparkSession = { -if (hiveClassesArePresent) { - sc.conf.set(CATALOG_IMPLEMENTATION.key, "hive") - new SparkSession(sc) -} else { - throw new IllegalArgumentException( -"Unable to instantiate SparkSession with Hive support because Hive classes are not found.") -} - } - } http://git-wip-us.apache.org/repos/asf/spark/blob/ed6f3f8a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala -- diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 665a44e..8de223f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -54,13 +54,15 @@ private[hive] object SparkSQLEnv extends
spark git commit: [SPARK-15152][DOC][MINOR] Scaladoc and Code style Improvements
Repository: spark Updated Branches: refs/heads/master 02c07e899 -> bbb777343 [SPARK-15152][DOC][MINOR] Scaladoc and Code style Improvements ## What changes were proposed in this pull request? Minor doc and code style fixes ## How was this patch tested? local build Author: Jacek LaskowskiCloses #12928 from jaceklaskowski/SPARK-15152. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bbb77734 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bbb77734 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bbb77734 Branch: refs/heads/master Commit: bbb77734374010e36731bf6db1fac0273de8206d Parents: 02c07e8 Author: Jacek Laskowski Authored: Thu May 5 16:34:27 2016 -0700 Committer: Andrew Or Committed: Thu May 5 16:34:27 2016 -0700 -- .../scala/org/apache/spark/Accumulator.scala| 9 ++-- .../scala/org/apache/spark/scheduler/Pool.scala | 9 ++-- .../spark/scheduler/SchedulingAlgorithm.scala | 13 ++ .../apache/spark/util/ShutdownHookManager.scala | 4 +- .../org/apache/spark/ml/feature/Binarizer.scala | 6 +-- .../org/apache/spark/mllib/util/MLUtils.scala | 5 +- .../sql/catalyst/planning/QueryPlanner.scala| 9 ++-- .../apache/spark/sql/execution/ExpandExec.scala | 2 +- .../apache/spark/sql/execution/SparkPlan.scala | 12 +++-- .../sql/execution/WholeStageCodegenExec.scala | 6 +-- .../apache/spark/sql/execution/objects.scala| 2 +- .../execution/streaming/FileStreamSource.scala | 2 +- .../streaming/IncrementalExecution.scala| 2 +- .../org/apache/spark/streaming/Checkpoint.scala | 49 ++-- .../org/apache/spark/deploy/yarn/Client.scala | 4 +- 15 files changed, 66 insertions(+), 68 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bbb77734/core/src/main/scala/org/apache/spark/Accumulator.scala -- diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala index 2324504..9d1f1d5 100644 --- 
a/core/src/main/scala/org/apache/spark/Accumulator.scala +++ b/core/src/main/scala/org/apache/spark/Accumulator.scala @@ -24,16 +24,17 @@ package org.apache.spark * They can be used to implement counters (as in MapReduce) or sums. Spark natively supports * accumulators of numeric value types, and programmers can add support for new types. * - * An accumulator is created from an initial value `v` by calling [[SparkContext#accumulator]]. - * Tasks running on the cluster can then add to it using the [[Accumulable#+=]] operator. + * An accumulator is created from an initial value `v` by calling + * [[SparkContext#accumulator SparkContext.accumulator]]. + * Tasks running on the cluster can then add to it using the [[Accumulable#+= +=]] operator. * However, they cannot read its value. Only the driver program can read the accumulator's value, - * using its value method. + * using its [[#value]] method. * * The interpreter session below shows an accumulator being used to add up the elements of an array: * * {{{ * scala> val accum = sc.accumulator(0) - * accum: spark.Accumulator[Int] = 0 + * accum: org.apache.spark.Accumulator[Int] = 0 * * scala> sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum += x) * ... 
http://git-wip-us.apache.org/repos/asf/spark/blob/bbb77734/core/src/main/scala/org/apache/spark/scheduler/Pool.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index a79e71e..5987cfe 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -26,16 +26,14 @@ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.SchedulingMode.SchedulingMode /** - * An Schedulable entity that represent collection of Pools or TaskSetManagers + * An Schedulable entity that represents collection of Pools or TaskSetManagers */ - private[spark] class Pool( val poolName: String, val schedulingMode: SchedulingMode, initMinShare: Int, initWeight: Int) - extends Schedulable - with Logging { + extends Schedulable with Logging { val schedulableQueue = new ConcurrentLinkedQueue[Schedulable] val schedulableNameToSchedulable = new ConcurrentHashMap[String, Schedulable] @@ -56,7 +54,8 @@ private[spark] class Pool( case SchedulingMode.FIFO => new FIFOSchedulingAlgorithm() case _ => -throw new IllegalArgumentException(s"Unsupported spark.scheduler.mode: $schedulingMode") +val msg = "Unsupported scheduling
spark git commit: [SPARK-13566][CORE] Avoid deadlock between BlockManager and Executor Thread
Repository: spark Updated Branches: refs/heads/branch-1.6 a3aa22a59 -> ab006523b [SPARK-13566][CORE] Avoid deadlock between BlockManager and Executor Thread Temp patch for branch 1.6, avoid deadlock between BlockManager and Executor Thread. Author: cenyuhaiCloses #11546 from cenyuhai/SPARK-13566. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab006523 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab006523 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab006523 Branch: refs/heads/branch-1.6 Commit: ab006523b840b1d2dbf3f5ff0a238558e7665a1e Parents: a3aa22a Author: cenyuhai Authored: Fri May 6 13:50:49 2016 -0700 Committer: Andrew Or Committed: Fri May 6 13:50:49 2016 -0700 -- .../org/apache/spark/executor/Executor.scala| 12 ++ .../org/apache/spark/storage/BlockManager.scala | 192 --- .../spark/storage/BlockManagerSuite.scala | 38 3 files changed, 170 insertions(+), 72 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ab006523/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index ab5bde5..b248e12 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -218,6 +218,7 @@ private[spark] class Executor( threwException = false res } finally { + val releasedLocks = env.blockManager.releaseAllLocksForTask(taskId) val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory() if (freedMemory > 0) { val errMsg = s"Managed memory leak detected; size = $freedMemory bytes, TID = $taskId" @@ -227,6 +228,17 @@ private[spark] class Executor( logError(errMsg) } } + + if (releasedLocks.nonEmpty) { +val errMsg = + s"${releasedLocks.size} block locks were not released by TID = $taskId:\n" + + releasedLocks.mkString("[", ", ", "]") +if 
(conf.getBoolean("spark.storage.exceptionOnPinLeak", false) && !threwException) { + throw new SparkException(errMsg) +} else { + logError(errMsg) +} + } } val taskFinish = System.currentTimeMillis() http://git-wip-us.apache.org/repos/asf/spark/blob/ab006523/core/src/main/scala/org/apache/spark/storage/BlockManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 538272d..288f756 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -19,12 +19,14 @@ package org.apache.spark.storage import java.io._ import java.nio.{ByteBuffer, MappedByteBuffer} +import java.util.concurrent.ConcurrentHashMap import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.concurrent.duration._ import scala.concurrent.{Await, ExecutionContext, Future} import scala.util.Random import scala.util.control.NonFatal +import scala.collection.JavaConverters._ import sun.nio.ch.DirectBuffer @@ -65,7 +67,7 @@ private[spark] class BlockManager( val master: BlockManagerMaster, defaultSerializer: Serializer, val conf: SparkConf, -memoryManager: MemoryManager, +val memoryManager: MemoryManager, mapOutputTracker: MapOutputTracker, shuffleManager: ShuffleManager, blockTransferService: BlockTransferService, @@ -164,6 +166,11 @@ private[spark] class BlockManager( * loaded yet. */ private lazy val compressionCodec: CompressionCodec = CompressionCodec.createCodec(conf) + // Blocks are removing by another thread + val pendingToRemove = new ConcurrentHashMap[BlockId, Long]() + + private val NON_TASK_WRITER = -1024L + /** * Initializes the BlockManager with the given appId. 
This is not performed in the constructor as * the appId may not be known at BlockManager instantiation time (in particular for the driver, @@ -1025,54 +1032,58 @@ private[spark] class BlockManager( val info = blockInfo.get(blockId).orNull // If the block has not already been dropped -if (info != null) { - info.synchronized { -// required ? As of now, this will be invoked only for blocks which are ready -// But in case this changes in future, adding for
spark git commit: [SPARK-14896][SQL] Deprecate HiveContext in python
Repository: spark Updated Branches: refs/heads/master b28137764 -> fa79d346e [SPARK-14896][SQL] Deprecate HiveContext in python ## What changes were proposed in this pull request? See title. ## How was this patch tested? PySpark tests. Author: Andrew Or <and...@databricks.com> Closes #12917 from andrewor14/deprecate-hive-context-python. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa79d346 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa79d346 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa79d346 Branch: refs/heads/master Commit: fa79d346e1a79ceda6ccd20e74eb850e769556ea Parents: b281377 Author: Andrew Or <and...@databricks.com> Authored: Wed May 4 17:39:30 2016 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Wed May 4 17:39:30 2016 -0700 -- python/pyspark/sql/column.py| 2 -- python/pyspark/sql/context.py | 9 - python/pyspark/sql/streaming.py | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fa79d346/python/pyspark/sql/column.py -- diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 43e9bae..90fb76f 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -418,8 +418,6 @@ class Column(object): >>> window = Window.partitionBy("name").orderBy("age").rowsBetween(-1, 1) >>> from pyspark.sql.functions import rank, min >>> # df.select(rank().over(window), min('age').over(window)) - -.. 
note:: Window functions is only supported with HiveContext in 1.4 """ from pyspark.sql.window import WindowSpec if not isinstance(window, WindowSpec): http://git-wip-us.apache.org/repos/asf/spark/blob/fa79d346/python/pyspark/sql/context.py -- diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 2096236..78ab2e8 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -17,6 +17,7 @@ from __future__ import print_function import sys +import warnings if sys.version >= '3': basestring = unicode = str @@ -434,7 +435,6 @@ class SQLContext(object): return ContinuousQueryManager(self._ssql_ctx.streams()) -# TODO(andrew): deprecate this class HiveContext(SQLContext): """A variant of Spark SQL that integrates with data stored in Hive. @@ -444,8 +444,15 @@ class HiveContext(SQLContext): :param sparkContext: The SparkContext to wrap. :param jhiveContext: An optional JVM Scala HiveContext. If set, we do not instantiate a new :class:`HiveContext` in the JVM, instead we make all calls to this object. + +.. note:: Deprecated in 2.0.0. Use SparkSession.builder.enableHiveSupport().getOrCreate(). """ +warnings.warn( +"HiveContext is deprecated in Spark 2.0.0. 
Please use " + +"SparkSession.builder.enableHiveSupport().getOrCreate() instead.", +DeprecationWarning) + def __init__(self, sparkContext, jhiveContext=None): if jhiveContext is None: sparkSession = SparkSession.withHiveSupport(sparkContext) http://git-wip-us.apache.org/repos/asf/spark/blob/fa79d346/python/pyspark/sql/streaming.py -- diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index bf03fdc..8238b8e 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -221,7 +221,7 @@ def _test(): globs['os'] = os globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) -globs['hiveContext'] = HiveContext(sc) +globs['hiveContext'] = HiveContext._createForTesting(sc) globs['df'] = \ globs['sqlContext'].read.format('text').stream('python/test_support/sql/streaming') - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14896][SQL] Deprecate HiveContext in python
Repository: spark Updated Branches: refs/heads/branch-2.0 aca46ecf8 -> fa3c5507f [SPARK-14896][SQL] Deprecate HiveContext in python ## What changes were proposed in this pull request? See title. ## How was this patch tested? PySpark tests. Author: Andrew Or <and...@databricks.com> Closes #12917 from andrewor14/deprecate-hive-context-python. (cherry picked from commit fa79d346e1a79ceda6ccd20e74eb850e769556ea) Signed-off-by: Andrew Or <and...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa3c5507 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa3c5507 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa3c5507 Branch: refs/heads/branch-2.0 Commit: fa3c5507fb7b612f78750abfd60dfdde0ad86da3 Parents: aca46ec Author: Andrew Or <and...@databricks.com> Authored: Wed May 4 17:39:30 2016 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Wed May 4 17:39:41 2016 -0700 -- python/pyspark/sql/column.py| 2 -- python/pyspark/sql/context.py | 9 - python/pyspark/sql/streaming.py | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fa3c5507/python/pyspark/sql/column.py -- diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 43e9bae..90fb76f 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -418,8 +418,6 @@ class Column(object): >>> window = Window.partitionBy("name").orderBy("age").rowsBetween(-1, 1) >>> from pyspark.sql.functions import rank, min >>> # df.select(rank().over(window), min('age').over(window)) - -.. 
note:: Window functions is only supported with HiveContext in 1.4 """ from pyspark.sql.window import WindowSpec if not isinstance(window, WindowSpec): http://git-wip-us.apache.org/repos/asf/spark/blob/fa3c5507/python/pyspark/sql/context.py -- diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 2096236..78ab2e8 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -17,6 +17,7 @@ from __future__ import print_function import sys +import warnings if sys.version >= '3': basestring = unicode = str @@ -434,7 +435,6 @@ class SQLContext(object): return ContinuousQueryManager(self._ssql_ctx.streams()) -# TODO(andrew): deprecate this class HiveContext(SQLContext): """A variant of Spark SQL that integrates with data stored in Hive. @@ -444,8 +444,15 @@ class HiveContext(SQLContext): :param sparkContext: The SparkContext to wrap. :param jhiveContext: An optional JVM Scala HiveContext. If set, we do not instantiate a new :class:`HiveContext` in the JVM, instead we make all calls to this object. + +.. note:: Deprecated in 2.0.0. Use SparkSession.builder.enableHiveSupport().getOrCreate(). """ +warnings.warn( +"HiveContext is deprecated in Spark 2.0.0. 
Please use " + +"SparkSession.builder.enableHiveSupport().getOrCreate() instead.", +DeprecationWarning) + def __init__(self, sparkContext, jhiveContext=None): if jhiveContext is None: sparkSession = SparkSession.withHiveSupport(sparkContext) http://git-wip-us.apache.org/repos/asf/spark/blob/fa3c5507/python/pyspark/sql/streaming.py -- diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index bf03fdc..8238b8e 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -221,7 +221,7 @@ def _test(): globs['os'] = os globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) -globs['hiveContext'] = HiveContext(sc) +globs['hiveContext'] = HiveContext._createForTesting(sc) globs['df'] = \ globs['sqlContext'].read.format('text').stream('python/test_support/sql/streaming') - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][SQL] Fix typo in DataFrameReader csv documentation
Repository: spark Updated Branches: refs/heads/master a432a2b86 -> b28137764 [MINOR][SQL] Fix typo in DataFrameReader csv documentation ## What changes were proposed in this pull request? Typo fix ## How was this patch tested? No tests My apologies for the tiny PR, but I stumbled across this today and wanted to get it corrected for 2.0. Author: sethah Closes #12912 from sethah/csv_typo. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b2813776 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b2813776 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b2813776 Branch: refs/heads/master Commit: b28137764716f56fa1a923c4278624a56364a505 Parents: a432a2b Author: sethah Authored: Wed May 4 16:46:13 2016 -0700 Committer: Andrew Or Committed: Wed May 4 16:46:13 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b2813776/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 2d4a68f..5bf696c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -407,7 +407,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * `header` (default `false`): uses the first line as names of columns. * `ignoreLeadingWhiteSpace` (default `false`): defines whether or not leading whitespaces * from values being read should be skipped. - * `ignoreTrailingWhiteSpace` (default `fDataFraalse`): defines whether or not trailing + * `ignoreTrailingWhiteSpace` (default `false`): defines whether or not trailing * whitespaces from values being read should be skipped. 
* `nullValue` (default empty string): sets the string representation of a null value. * `nanValue` (default `NaN`): sets the string representation of a non-number" value. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][SQL] Fix typo in DataFrameReader csv documentation
Repository: spark Updated Branches: refs/heads/branch-2.0 701c66729 -> aca46ecf8 [MINOR][SQL] Fix typo in DataFrameReader csv documentation ## What changes were proposed in this pull request? Typo fix ## How was this patch tested? No tests My apologies for the tiny PR, but I stumbled across this today and wanted to get it corrected for 2.0. Author: sethah Closes #12912 from sethah/csv_typo. (cherry picked from commit b28137764716f56fa1a923c4278624a56364a505) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aca46ecf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aca46ecf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aca46ecf Branch: refs/heads/branch-2.0 Commit: aca46ecf8ebc1e477cf1ca8aabf45861bf12e225 Parents: 701c667 Author: sethah Authored: Wed May 4 16:46:13 2016 -0700 Committer: Andrew Or Committed: Wed May 4 16:46:25 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aca46ecf/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 2d4a68f..5bf696c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -407,7 +407,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * `header` (default `false`): uses the first line as names of columns. * `ignoreLeadingWhiteSpace` (default `false`): defines whether or not leading whitespaces * from values being read should be skipped. 
- * `ignoreTrailingWhiteSpace` (default `fDataFraalse`): defines whether or not trailing + * `ignoreTrailingWhiteSpace` (default `false`): defines whether or not trailing * whitespaces from values being read should be skipped. * `nullValue` (default empty string): sets the string representation of a null value. * `nanValue` (default `NaN`): sets the string representation of a non-number" value. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14893][SQL] Re-enable HiveSparkSubmitSuite SPARK-8489 test after HiveContext is removed
Repository: spark Updated Branches: refs/heads/master 08db49126 -> 02c07e899 [SPARK-14893][SQL] Re-enable HiveSparkSubmitSuite SPARK-8489 test after HiveContext is removed ## What changes were proposed in this pull request? Enable the test that was disabled when HiveContext was removed. ## How was this patch tested? Made sure the enabled test passes with the new jar. Author: Dilip BiswalCloses #12924 from dilipbiswal/spark-14893. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/02c07e89 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/02c07e89 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/02c07e89 Branch: refs/heads/master Commit: 02c07e8999dca545849cb3aa758a624dc51cd1e9 Parents: 08db491 Author: Dilip Biswal Authored: Thu May 5 14:44:45 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:44:45 2016 -0700 -- .../regression-test-SPARK-8489/Main.scala| 12 +++- .../regression-test-SPARK-8489/test-2.10.jar | Bin 6873 -> 6865 bytes .../regression-test-SPARK-8489/test-2.11.jar | Bin 7039 -> 7030 bytes .../spark/sql/hive/HiveSparkSubmitSuite.scala| 3 +-- 4 files changed, 8 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/02c07e89/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala -- diff --git a/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala b/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala index 10a017d..4fbbbac 100644 --- a/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala +++ b/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala @@ -15,7 +15,6 @@ * limitations under the License. 
*/ -import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession /** @@ -33,15 +32,18 @@ object Main { def main(args: Array[String]) { // scalastyle:off println println("Running regression test for SPARK-8489.") -val sc = new SparkContext("local", "testing") -val sparkSession = SparkSession.withHiveSupport(sc) +val spark = SparkSession.builder + .master("local") + .appName("testing") + .enableHiveSupport() + .getOrCreate() // This line should not throw scala.reflect.internal.MissingRequirementError. // See SPARK-8470 for more detail. -val df = sparkSession.createDataFrame(Seq(MyCoolClass("1", "2", "3"))) +val df = spark.createDataFrame(Seq(MyCoolClass("1", "2", "3"))) df.collect() println("Regression test for SPARK-8489 success!") // scalastyle:on println -sc.stop() +spark.stop() } } http://git-wip-us.apache.org/repos/asf/spark/blob/02c07e89/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar -- diff --git a/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar b/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar index 26d410f..3f28d37 100644 Binary files a/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar and b/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar differ http://git-wip-us.apache.org/repos/asf/spark/blob/02c07e89/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar -- diff --git a/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar b/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar index f347847..5e09369 100644 Binary files a/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar and b/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar differ http://git-wip-us.apache.org/repos/asf/spark/blob/02c07e89/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala -- diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index a320011..a717a99 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -142,8 +142,7 @@ class HiveSparkSubmitSuite runSparkSubmit(args) } - // TODO: re-enable this after rebuilding the jar (HiveContext was removed) - ignore("SPARK-8489: MissingRequirementError during reflection") { + test("SPARK-8489: MissingRequirementError during reflection") { // This test uses a pre-built jar to test SPARK-8489. In a nutshell,
spark git commit: [SPARK-14893][SQL] Re-enable HiveSparkSubmitSuite SPARK-8489 test after HiveContext is removed
Repository: spark Updated Branches: refs/heads/branch-2.0 80a4bfa4d -> 1064a3303 [SPARK-14893][SQL] Re-enable HiveSparkSubmitSuite SPARK-8489 test after HiveContext is removed ## What changes were proposed in this pull request? Enable the test that was disabled when HiveContext was removed. ## How was this patch tested? Made sure the enabled test passes with the new jar. Author: Dilip BiswalCloses #12924 from dilipbiswal/spark-14893. (cherry picked from commit 02c07e8999dca545849cb3aa758a624dc51cd1e9) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1064a330 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1064a330 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1064a330 Branch: refs/heads/branch-2.0 Commit: 1064a3303e72d92db02cb94eb2bb81245ac68fc6 Parents: 80a4bfa Author: Dilip Biswal Authored: Thu May 5 14:44:45 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:44:54 2016 -0700 -- .../regression-test-SPARK-8489/Main.scala| 12 +++- .../regression-test-SPARK-8489/test-2.10.jar | Bin 6873 -> 6865 bytes .../regression-test-SPARK-8489/test-2.11.jar | Bin 7039 -> 7030 bytes .../spark/sql/hive/HiveSparkSubmitSuite.scala| 3 +-- 4 files changed, 8 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1064a330/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala -- diff --git a/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala b/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala index 10a017d..4fbbbac 100644 --- a/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala +++ b/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala @@ -15,7 +15,6 @@ * limitations under the License. 
*/ -import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession /** @@ -33,15 +32,18 @@ object Main { def main(args: Array[String]) { // scalastyle:off println println("Running regression test for SPARK-8489.") -val sc = new SparkContext("local", "testing") -val sparkSession = SparkSession.withHiveSupport(sc) +val spark = SparkSession.builder + .master("local") + .appName("testing") + .enableHiveSupport() + .getOrCreate() // This line should not throw scala.reflect.internal.MissingRequirementError. // See SPARK-8470 for more detail. -val df = sparkSession.createDataFrame(Seq(MyCoolClass("1", "2", "3"))) +val df = spark.createDataFrame(Seq(MyCoolClass("1", "2", "3"))) df.collect() println("Regression test for SPARK-8489 success!") // scalastyle:on println -sc.stop() +spark.stop() } } http://git-wip-us.apache.org/repos/asf/spark/blob/1064a330/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar -- diff --git a/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar b/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar index 26d410f..3f28d37 100644 Binary files a/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar and b/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.10.jar differ http://git-wip-us.apache.org/repos/asf/spark/blob/1064a330/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar -- diff --git a/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar b/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar index f347847..5e09369 100644 Binary files a/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar and b/sql/hive/src/test/resources/regression-test-SPARK-8489/test-2.11.jar differ http://git-wip-us.apache.org/repos/asf/spark/blob/1064a330/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala -- diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index a320011..a717a99 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -142,8 +142,7 @@ class HiveSparkSubmitSuite runSparkSubmit(args) } - // TODO: re-enable this after rebuilding the jar (HiveContext was removed) - ignore("SPARK-8489: MissingRequirementError during reflection") { +
spark git commit: [SPARK-14124][SQL][FOLLOWUP] Implement Database-related DDL Commands
Repository: spark Updated Branches: refs/heads/master 63db2bd28 -> 8cba57a75 [SPARK-14124][SQL][FOLLOWUP] Implement Database-related DDL Commands What changes were proposed in this pull request? First, a few test cases failed in mac OS X because the property value of `java.io.tmpdir` does not include a trailing slash on some platform. Hive always removes the last trailing slash. For example, what I got in the web: ``` Win NT --> C:\TEMP\ Win XP --> C:\TEMP Solaris --> /var/tmp/ Linux --> /var/tmp ``` Second, a couple of test cases are added to verify if the commands work properly. How was this patch tested? Added a test case for it and correct the previous test cases. Author: gatorsmileAuthor: xiaoli Author: Xiao Li Closes #12081 from gatorsmile/mkdir. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8cba57a7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8cba57a7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8cba57a7 Branch: refs/heads/master Commit: 8cba57a75cf9e29b54d97366a039a97a2f305d5d Parents: 63db2bd Author: gatorsmile Authored: Thu May 5 14:34:24 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:34:24 2016 -0700 -- .../sql/catalyst/catalog/SessionCatalog.scala | 4 + .../spark/sql/execution/command/ddl.scala | 5 +- .../spark/sql/execution/command/DDLSuite.scala | 249 --- .../spark/sql/hive/execution/HiveDDLSuite.scala | 150 ++- 4 files changed, 311 insertions(+), 97 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8cba57a7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index ff63034..eff420e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -146,6 +146,10 @@ class SessionCatalog( currentDb = db } + /** + * Get the path for creating a non-default database when database location is not provided + * by users. + */ def getDefaultDBPath(db: String): String = { val database = if (conf.caseSensitiveAnalysis) db else db.toLowerCase new Path(new Path(conf.warehousePath), database + ".db").toString http://git-wip-us.apache.org/repos/asf/spark/blob/8cba57a7/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index aa06c01..085bdaf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -40,7 +40,10 @@ import org.apache.spark.sql.types._ * unless 'ifNotExists' is true. 
* The syntax of using this command in SQL is: * {{{ - *CREATE DATABASE|SCHEMA [IF NOT EXISTS] database_name + * CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name + * [COMMENT database_comment] + * [LOCATION database_directory] + * [WITH DBPROPERTIES (property_name=property_value, ...)]; * }}} */ case class CreateDatabase( http://git-wip-us.apache.org/repos/asf/spark/blob/8cba57a7/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 0ae099e..6085098 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -95,49 +95,81 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { catalog.createPartitions(tableName, Seq(part), ignoreIfExists = false) } + private def appendTrailingSlash(path: String): String = { +if (!path.endsWith(File.separator)) path + File.separator else path + } + test("the qualified path of a database is stored in the catalog") { val catalog = sqlContext.sessionState.catalog -val path = System.getProperty("java.io.tmpdir") -// The generated temp path is not qualified. -
spark git commit: [SPARK-9926] Parallelize partition logic in UnionRDD.
Repository: spark Updated Branches: refs/heads/branch-2.0 19a14e841 -> 80a4bfa4d [SPARK-9926] Parallelize partition logic in UnionRDD. This patch has the new logic from #8512 that uses a parallel collection to compute partitions in UnionRDD. The rest of #8512 added an alternative code path for calculating splits in S3, but that isn't necessary to get the same speedup. The underlying problem wasn't that bulk listing wasn't used, it was that an extra FileStatus was retrieved for each file. The fix was just committed as [HADOOP-12810](https://issues.apache.org/jira/browse/HADOOP-12810). (I think the original commit also used a single prefix to enumerate all paths, but that isn't always helpful and it was removed in later versions so there is no need for SparkS3Utils.) I tested this using the same table that piapiaozhexiu was using. Calculating splits for a 10-day period took 25 seconds with this change and HADOOP-12810, which is on par with the results from #8512. Author: Ryan BlueAuthor: Cheolsoo Park Closes #11242 from rdblue/SPARK-9926-parallelize-union-rdd. 
(cherry picked from commit 08db491265a3b50e31993ac6aa07c3f0dd08cdbb) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80a4bfa4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80a4bfa4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80a4bfa4 Branch: refs/heads/branch-2.0 Commit: 80a4bfa4d1c86398b90b26c34d8dcbc2355f5a6a Parents: 19a14e8 Author: Ryan Blue Authored: Thu May 5 14:40:37 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:40:46 2016 -0700 -- .../scala/org/apache/spark/rdd/UnionRDD.scala | 18 +- .../scala/org/apache/spark/rdd/RDDSuite.scala | 17 + 2 files changed, 34 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/80a4bfa4/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala index 66cf436..8171dcc 100644 --- a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala @@ -20,6 +20,8 @@ package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer +import scala.collection.parallel.ForkJoinTaskSupport +import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} @@ -62,8 +64,22 @@ class UnionRDD[T: ClassTag]( var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies + // visible for testing + private[spark] val isPartitionListingParallel: Boolean = +rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) + + @transient private lazy val partitionEvalTaskSupport = + new ForkJoinTaskSupport(new ForkJoinPool(8)) + override def getPartitions: Array[Partition] = { -val array = new 
Array[Partition](rdds.map(_.partitions.length).sum) +val parRDDs = if (isPartitionListingParallel) { + val parArray = rdds.par + parArray.tasksupport = partitionEvalTaskSupport + parArray +} else { + rdds +} +val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) http://git-wip-us.apache.org/repos/asf/spark/blob/80a4bfa4/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index a663dab..979fb42 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -116,6 +116,23 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext { assert(sc.union(Seq(nums, nums)).collect().toList === List(1, 2, 3, 4, 1, 2, 3, 4)) } + test("SparkContext.union parallel partition listing") { +val nums1 = sc.makeRDD(Array(1, 2, 3, 4), 2) +val nums2 = sc.makeRDD(Array(5, 6, 7, 8), 2) +val serialUnion = sc.union(nums1, nums2) +val expected = serialUnion.collect().toList + +assert(serialUnion.asInstanceOf[UnionRDD[Int]].isPartitionListingParallel === false) + +sc.conf.set("spark.rdd.parallelListingThreshold", "1")
spark git commit: [SPARK-15158][CORE] downgrade shouldRollover message to debug level
Repository: spark Updated Branches: refs/heads/master 2c170dd3d -> 5c47db065 [SPARK-15158][CORE] downgrade shouldRollover message to debug level ## What changes were proposed in this pull request? set log level to debug when check shouldRollover ## How was this patch tested? It's tested manually. Author: dependCloses #12931 from depend/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c47db06 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c47db06 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c47db06 Branch: refs/heads/master Commit: 5c47db06570e65d3f5544d6f26bbdf893e275b94 Parents: 2c170dd Author: depend Authored: Thu May 5 14:39:35 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:39:35 2016 -0700 -- .../main/scala/org/apache/spark/util/logging/RollingPolicy.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c47db06/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala index 6e80db2..5c4238c 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala @@ -113,7 +113,7 @@ private[spark] class SizeBasedRollingPolicy( /** Should rollover if the next set of bytes is going to exceed the size limit */ def shouldRollover(bytesToBeWritten: Long): Boolean = { -logInfo(s"$bytesToBeWritten + $bytesWrittenSinceRollover > $rolloverSizeBytes") +logDebug(s"$bytesToBeWritten + $bytesWrittenSinceRollover > $rolloverSizeBytes") bytesToBeWritten + bytesWrittenSinceRollover > rolloverSizeBytes } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15158][CORE] downgrade shouldRollover message to debug level
Repository: spark Updated Branches: refs/heads/branch-2.0 8b4ab590c -> 19a14e841 [SPARK-15158][CORE] downgrade shouldRollover message to debug level ## What changes were proposed in this pull request? set log level to debug when check shouldRollover ## How was this patch tested? It's tested manually. Author: dependCloses #12931 from depend/master. (cherry picked from commit 5c47db06570e65d3f5544d6f26bbdf893e275b94) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/19a14e84 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/19a14e84 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/19a14e84 Branch: refs/heads/branch-2.0 Commit: 19a14e8417cf4ced0dd0fce863d3f4a0bcf414aa Parents: 8b4ab59 Author: depend Authored: Thu May 5 14:39:35 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:39:45 2016 -0700 -- .../main/scala/org/apache/spark/util/logging/RollingPolicy.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/19a14e84/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala index 6e80db2..5c4238c 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala @@ -113,7 +113,7 @@ private[spark] class SizeBasedRollingPolicy( /** Should rollover if the next set of bytes is going to exceed the size limit */ def shouldRollover(bytesToBeWritten: Long): Boolean = { -logInfo(s"$bytesToBeWritten + $bytesWrittenSinceRollover > $rolloverSizeBytes") +logDebug(s"$bytesToBeWritten + $bytesWrittenSinceRollover > $rolloverSizeBytes") bytesToBeWritten + bytesWrittenSinceRollover > rolloverSizeBytes } - To unsubscribe, e-mail: 
commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9926] Parallelize partition logic in UnionRDD.
Repository: spark Updated Branches: refs/heads/master 5c47db065 -> 08db49126 [SPARK-9926] Parallelize partition logic in UnionRDD. This patch has the new logic from #8512 that uses a parallel collection to compute partitions in UnionRDD. The rest of #8512 added an alternative code path for calculating splits in S3, but that isn't necessary to get the same speedup. The underlying problem wasn't that bulk listing wasn't used, it was that an extra FileStatus was retrieved for each file. The fix was just committed as [HADOOP-12810](https://issues.apache.org/jira/browse/HADOOP-12810). (I think the original commit also used a single prefix to enumerate all paths, but that isn't always helpful and it was removed in later versions so there is no need for SparkS3Utils.) I tested this using the same table that piapiaozhexiu was using. Calculating splits for a 10-day period took 25 seconds with this change and HADOOP-12810, which is on par with the results from #8512. Author: Ryan BlueAuthor: Cheolsoo Park Closes #11242 from rdblue/SPARK-9926-parallelize-union-rdd. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08db4912 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08db4912 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08db4912 Branch: refs/heads/master Commit: 08db491265a3b50e31993ac6aa07c3f0dd08cdbb Parents: 5c47db0 Author: Ryan Blue Authored: Thu May 5 14:40:37 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:40:37 2016 -0700 -- .../scala/org/apache/spark/rdd/UnionRDD.scala | 18 +- .../scala/org/apache/spark/rdd/RDDSuite.scala | 17 + 2 files changed, 34 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/08db4912/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala index 66cf436..8171dcc 100644 --- a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala @@ -20,6 +20,8 @@ package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer +import scala.collection.parallel.ForkJoinTaskSupport +import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} @@ -62,8 +64,22 @@ class UnionRDD[T: ClassTag]( var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies + // visible for testing + private[spark] val isPartitionListingParallel: Boolean = +rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) + + @transient private lazy val partitionEvalTaskSupport = + new ForkJoinTaskSupport(new ForkJoinPool(8)) + override def getPartitions: Array[Partition] = { -val array = new Array[Partition](rdds.map(_.partitions.length).sum) +val parRDDs = if (isPartitionListingParallel) { + val parArray = rdds.par + 
parArray.tasksupport = partitionEvalTaskSupport + parArray +} else { + rdds +} +val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) http://git-wip-us.apache.org/repos/asf/spark/blob/08db4912/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index a663dab..979fb42 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -116,6 +116,23 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext { assert(sc.union(Seq(nums, nums)).collect().toList === List(1, 2, 3, 4, 1, 2, 3, 4)) } + test("SparkContext.union parallel partition listing") { +val nums1 = sc.makeRDD(Array(1, 2, 3, 4), 2) +val nums2 = sc.makeRDD(Array(5, 6, 7, 8), 2) +val serialUnion = sc.union(nums1, nums2) +val expected = serialUnion.collect().toList + +assert(serialUnion.asInstanceOf[UnionRDD[Int]].isPartitionListingParallel === false) + +sc.conf.set("spark.rdd.parallelListingThreshold", "1") +val parallelUnion = sc.union(nums1, nums2) +val actual = parallelUnion.collect().toList +
spark git commit: [SPARK-15072][SQL][REPL][EXAMPLES] Remove SparkSession.withHiveSupport
Repository: spark Updated Branches: refs/heads/branch-2.0 fe268ee1e -> 59fa480b6 [SPARK-15072][SQL][REPL][EXAMPLES] Remove SparkSession.withHiveSupport ## What changes were proposed in this pull request? Removing the `withHiveSupport` method of `SparkSession`, instead use `enableHiveSupport` ## How was this patch tested? ran tests locally Author: Sandeep SinghCloses #12851 from techaddict/SPARK-15072. (cherry picked from commit ed6f3f8a5f3a6bf7c53e13c2798de398c9a526a6) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/59fa480b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/59fa480b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/59fa480b Branch: refs/heads/branch-2.0 Commit: 59fa480b6d014369d58db9c5b77d82ddd17ee2a2 Parents: fe268ee Author: Sandeep Singh Authored: Thu May 5 14:35:15 2016 -0700 Committer: Andrew Or Committed: Thu May 5 14:35:23 2016 -0700 -- .../spark/examples/sql/hive/HiveFromSpark.scala | 14 +- .../scala/org/apache/spark/sql/SparkSession.scala | 13 - .../spark/sql/hive/thriftserver/SparkSQLEnv.scala | 10 ++ .../apache/spark/sql/hive/HiveSparkSubmitSuite.scala | 7 +-- 4 files changed, 20 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/59fa480b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala index ff33091..a15cf5d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala @@ -36,15 +36,19 @@ object HiveFromSpark { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("HiveFromSpark") -val sc = new SparkContext(sparkConf) // A hive context adds support for finding 
tables in the MetaStore and writing queries // using HiveQL. Users who do not have an existing Hive deployment can still create a // HiveContext. When not configured by the hive-site.xml, the context automatically // creates metastore_db and warehouse in the current directory. -val sparkSession = SparkSession.withHiveSupport(sc) -import sparkSession.implicits._ -import sparkSession.sql +val spark = SparkSession.builder + .config(sparkConf) + .enableHiveSupport() + .getOrCreate() +val sc = spark.sparkContext + +import spark.implicits._ +import spark.sql sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") sql(s"LOAD DATA LOCAL INPATH '${kv1File.getAbsolutePath}' INTO TABLE src") @@ -74,7 +78,7 @@ object HiveFromSpark { println("Result of SELECT *:") sql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println) -sc.stop() +spark.stop() } } // scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/59fa480b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index aa7c335..9ed3756 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -816,17 +816,4 @@ object SparkSession { } } - /** - * Create a new [[SparkSession]] with a catalog backed by Hive. 
- */ - def withHiveSupport(sc: SparkContext): SparkSession = { -if (hiveClassesArePresent) { - sc.conf.set(CATALOG_IMPLEMENTATION.key, "hive") - new SparkSession(sc) -} else { - throw new IllegalArgumentException( -"Unable to instantiate SparkSession with Hive support because Hive classes are not found.") -} - } - } http://git-wip-us.apache.org/repos/asf/spark/blob/59fa480b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala -- diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 665a44e..8de223f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++
spark git commit: [HOTFIX] Fix MLUtils compile
Repository: spark Updated Branches: refs/heads/branch-2.0 a1887f213 -> 7dc3fb6ae [HOTFIX] Fix MLUtils compile Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7dc3fb6a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7dc3fb6a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7dc3fb6a Branch: refs/heads/branch-2.0 Commit: 7dc3fb6ae44ba9863eb59c2724c73201c46e5213 Parents: a1887f2 Author: Andrew Or Authored: Thu May 5 16:51:06 2016 -0700 Committer: Andrew Or Committed: Thu May 5 16:51:52 2016 -0700 -- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7dc3fb6a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 86ce970..f0346e6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -119,7 +119,7 @@ object MLUtils { previous = current i += 1 } -(label, indices, values) +(label, indices.toArray, values.toArray) } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] Fix MLUtils compile
Repository: spark Updated Branches: refs/heads/master bbb777343 -> 7f5922aa4 [HOTFIX] Fix MLUtils compile Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f5922aa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f5922aa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f5922aa Branch: refs/heads/master Commit: 7f5922aa4a810a0b9cc783956a8b7aa3dad86a0a Parents: bbb7773 Author: Andrew Or Authored: Thu May 5 16:51:06 2016 -0700 Committer: Andrew Or Committed: Thu May 5 16:51:06 2016 -0700 -- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f5922aa/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 86ce970..f0346e6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -119,7 +119,7 @@ object MLUtils { previous = current i += 1 } -(label, indices, values) +(label, indices.toArray, values.toArray) } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12837][CORE] reduce network IO for accumulators
Repository: spark Updated Branches: refs/heads/master 0b9cae424 -> bcfee153b [SPARK-12837][CORE] reduce network IO for accumulators Sending un-updated accumulators back to driver makes no sense, as merging a zero value accumulator is a no-op. We should only send back updated accumulators, to save network IO. new test in `TaskContextSuite` Author: Wenchen FanCloses #12899 from cloud-fan/acc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bcfee153 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bcfee153 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bcfee153 Branch: refs/heads/master Commit: bcfee153b1cacfe617e602f3b72c0877e0bdf1f7 Parents: 0b9cae4 Author: Wenchen Fan Authored: Tue May 10 11:16:31 2016 -0700 Committer: Andrew Or Committed: Tue May 10 11:16:56 2016 -0700 -- .../org/apache/spark/executor/TaskMetrics.scala | 2 +- .../scala/org/apache/spark/scheduler/Task.scala | 9 - .../org/apache/spark/util/AccumulatorV2.scala | 4 +-- .../spark/scheduler/TaskContextSuite.scala | 37 .../spark/sql/execution/metric/SQLMetrics.scala | 6 ++-- .../spark/sql/execution/ui/SQLListener.scala| 2 +- .../sql/execution/ui/SQLListenerSuite.scala | 12 +++ 7 files changed, 51 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bcfee153/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index 7f4652c..1893167 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -218,7 +218,7 @@ class TaskMetrics private[spark] () extends Serializable { /** * External accumulators registered with this task. 
*/ - @transient private lazy val externalAccums = new ArrayBuffer[AccumulatorV2[_, _]] + @transient private[spark] lazy val externalAccums = new ArrayBuffer[AccumulatorV2[_, _]] private[spark] def registerAccumulator(a: AccumulatorV2[_, _]): Unit = { externalAccums += a http://git-wip-us.apache.org/repos/asf/spark/blob/bcfee153/core/src/main/scala/org/apache/spark/scheduler/Task.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 95bcc7b..15f863b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -155,7 +155,14 @@ private[spark] abstract class Task[T]( */ def collectAccumulatorUpdates(taskFailed: Boolean = false): Seq[AccumulatorV2[_, _]] = { if (context != null) { - context.taskMetrics.accumulators().filter { a => !taskFailed || a.countFailedValues } + context.taskMetrics.internalAccums.filter { a => +// RESULT_SIZE accumulator is always zero at executor, we need to send it back as its +// value will be updated at driver side. +// Note: internal accumulators representing task metrics always count failed values +!a.isZero || a.name == Some(InternalAccumulator.RESULT_SIZE) + // zero value external accumulators may still be useful, e.g. SQLMetrics, we should not filter + // them out. 
+ } ++ context.taskMetrics.externalAccums.filter(a => !taskFailed || a.countFailedValues) } else { Seq.empty } http://git-wip-us.apache.org/repos/asf/spark/blob/bcfee153/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index d8f380e..c487903 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -256,7 +256,7 @@ class LongAccumulator extends AccumulatorV2[jl.Long, jl.Long] { * Adds v to the accumulator, i.e. increment sum by v and count by 1. * @since 2.0.0 */ - override def isZero: Boolean = _count == 0L + override def isZero: Boolean = _sum == 0L && _count == 0 override def copyAndReset(): LongAccumulator = new LongAccumulator @@ -321,7 +321,7 @@ class DoubleAccumulator extends AccumulatorV2[jl.Double, jl.Double] { private[this] var _sum = 0.0 private[this] var _count = 0L - override def isZero: Boolean = _count == 0L + override def isZero:
spark git commit: [SPARK-12837][CORE] reduce network IO for accumulators
Repository: spark Updated Branches: refs/heads/branch-2.0 af12b0a50 -> 19a9c23c2 [SPARK-12837][CORE] reduce network IO for accumulators Sending un-updated accumulators back to driver makes no sense, as merging a zero value accumulator is a no-op. We should only send back updated accumulators, to save network IO. new test in `TaskContextSuite` Author: Wenchen FanCloses #12899 from cloud-fan/acc. (cherry picked from commit bcfee153b1cacfe617e602f3b72c0877e0bdf1f7) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/19a9c23c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/19a9c23c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/19a9c23c Branch: refs/heads/branch-2.0 Commit: 19a9c23c2d2ee6d16d8078db0730602ae5a591ed Parents: af12b0a Author: Wenchen Fan Authored: Tue May 10 11:16:31 2016 -0700 Committer: Andrew Or Committed: Tue May 10 11:17:09 2016 -0700 -- .../org/apache/spark/executor/TaskMetrics.scala | 2 +- .../scala/org/apache/spark/scheduler/Task.scala | 9 - .../org/apache/spark/util/AccumulatorV2.scala | 4 +-- .../spark/scheduler/TaskContextSuite.scala | 37 .../spark/sql/execution/metric/SQLMetrics.scala | 6 ++-- .../spark/sql/execution/ui/SQLListener.scala| 2 +- .../sql/execution/ui/SQLListenerSuite.scala | 12 +++ 7 files changed, 51 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/19a9c23c/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index 7f4652c..1893167 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -218,7 +218,7 @@ class TaskMetrics private[spark] () extends Serializable { /** * External accumulators registered with this task. 
*/ - @transient private lazy val externalAccums = new ArrayBuffer[AccumulatorV2[_, _]] + @transient private[spark] lazy val externalAccums = new ArrayBuffer[AccumulatorV2[_, _]] private[spark] def registerAccumulator(a: AccumulatorV2[_, _]): Unit = { externalAccums += a http://git-wip-us.apache.org/repos/asf/spark/blob/19a9c23c/core/src/main/scala/org/apache/spark/scheduler/Task.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 95bcc7b..15f863b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -155,7 +155,14 @@ private[spark] abstract class Task[T]( */ def collectAccumulatorUpdates(taskFailed: Boolean = false): Seq[AccumulatorV2[_, _]] = { if (context != null) { - context.taskMetrics.accumulators().filter { a => !taskFailed || a.countFailedValues } + context.taskMetrics.internalAccums.filter { a => +// RESULT_SIZE accumulator is always zero at executor, we need to send it back as its +// value will be updated at driver side. +// Note: internal accumulators representing task metrics always count failed values +!a.isZero || a.name == Some(InternalAccumulator.RESULT_SIZE) + // zero value external accumulators may still be useful, e.g. SQLMetrics, we should not filter + // them out. 
+ } ++ context.taskMetrics.externalAccums.filter(a => !taskFailed || a.countFailedValues) } else { Seq.empty } http://git-wip-us.apache.org/repos/asf/spark/blob/19a9c23c/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index d8f380e..c487903 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -256,7 +256,7 @@ class LongAccumulator extends AccumulatorV2[jl.Long, jl.Long] { * Adds v to the accumulator, i.e. increment sum by v and count by 1. * @since 2.0.0 */ - override def isZero: Boolean = _count == 0L + override def isZero: Boolean = _sum == 0L && _count == 0 override def copyAndReset(): LongAccumulator = new LongAccumulator @@ -321,7 +321,7 @@ class DoubleAccumulator extends AccumulatorV2[jl.Double, jl.Double] {
[04/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 1742df3..c31dffe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -27,16 +27,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowComments off") { val str = """{'name': /* hello */ 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowComments on") { val str = """{'name': /* hello */ 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowComments", "true").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.option("allowComments", "true").json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") @@ -44,16 +44,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowSingleQuotes off") { val str = """{'name': 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowSingleQuotes", "false").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.option("allowSingleQuotes", "false").json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowSingleQuotes on") { val str = """{'name': 'Reynold Xin'}""" -val rdd = 
sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") @@ -61,16 +61,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowUnquotedFieldNames off") { val str = """{name: 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowUnquotedFieldNames on") { val str = """{name: 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowUnquotedFieldNames", "true").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.option("allowUnquotedFieldNames", "true").json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") @@ -78,16 +78,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowNumericLeadingZeros off") { val str = """{"age": 0018}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowNumericLeadingZeros on") { val str = """{"age": 0018}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowNumericLeadingZeros", "true").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.option("allowNumericLeadingZeros", "true").json(rdd) assert(df.schema.head.name == "age") assert(df.first().getLong(0) == 18) @@ -97,16 +97,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { // 
JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS. ignore("allowNonNumericNumbers off") { val str = """{"age": NaN}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } ignore("allowNonNumericNumbers on") { val str = """{"age": NaN}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowNonNumericNumbers", "true").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df =
[06/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java -- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java index 7863177..059c2d9 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java @@ -26,36 +26,30 @@ import scala.Tuple2; import org.junit.After; import org.junit.Before; -import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.KeyValueGroupedDataset; -import org.apache.spark.sql.test.TestSQLContext; +import org.apache.spark.sql.test.TestSparkSession; /** * Common test base shared across this and Java8DatasetAggregatorSuite. 
*/ public class JavaDatasetAggregatorSuiteBase implements Serializable { - protected transient JavaSparkContext jsc; - protected transient TestSQLContext context; + private transient TestSparkSession spark; @Before public void setUp() { // Trigger static initializer of TestData -SparkContext sc = new SparkContext("local[*]", "testing"); -jsc = new JavaSparkContext(sc); -context = new TestSQLContext(sc); -context.loadTestData(); +spark = new TestSparkSession(); +spark.loadTestData(); } @After public void tearDown() { -context.sparkContext().stop(); -context = null; -jsc = null; +spark.stop(); +spark = null; } protectedTuple2 tuple2(T1 t1, T2 t2) { @@ -66,7 +60,7 @@ public class JavaDatasetAggregatorSuiteBase implements Serializable { Encoder > encoder = Encoders.tuple(Encoders.STRING(), Encoders.INT()); List > data = Arrays.asList(tuple2("a", 1), tuple2("a", 2), tuple2("b", 3)); -Dataset > ds = context.createDataset(data, encoder); +Dataset > ds = spark.createDataset(data, encoder); return ds.groupByKey( new MapFunction , String>() { http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java -- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java index 9e65158..d0435e4 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java @@ -19,14 +19,16 @@ package test.org.apache.spark.sql.sources; import java.io.File; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import 
org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.*; @@ -37,8 +39,8 @@ import org.apache.spark.util.Utils; public class JavaSaveLoadSuite { - private transient JavaSparkContext sc; - private transient SQLContext sqlContext; + private transient SparkSession spark; + private transient JavaSparkContext jsc; File path; Dataset df; @@ -52,9 +54,11 @@ public class JavaSaveLoadSuite { @Before public void setUp() throws IOException { -SparkContext _sc = new SparkContext("local[*]", "testing"); -sqlContext = new SQLContext(_sc); -sc = new JavaSparkContext(_sc); +spark = SparkSession.builder() + .master("local[*]") + .appName("testing") + .getOrCreate(); +jsc = new JavaSparkContext(spark.sparkContext()); path = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile(); @@ -66,16 +70,15 @@ public class JavaSaveLoadSuite { for (int i = 0; i < 10; i++) { jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}"); } -JavaRDD rdd = sc.parallelize(jsonObjects); -df = sqlContext.read().json(rdd); +JavaRDD rdd = jsc.parallelize(jsonObjects); +df = spark.read().json(rdd); df.registerTempTable("jsonTable"); } @After public void tearDown() { -sqlContext.sparkContext().stop(); -sqlContext = null; -sc = null; +
[05/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 1ff288c..e401abe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -57,7 +57,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("show functions") { def getFunctions(pattern: String): Seq[Row] = { - StringUtils.filterPattern(sqlContext.sessionState.functionRegistry.listFunction(), pattern) + StringUtils.filterPattern(spark.sessionState.functionRegistry.listFunction(), pattern) .map(Row(_)) } checkAnswer(sql("SHOW functions"), getFunctions("*")) @@ -88,7 +88,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-14415: All functions should have own descriptions") { -for (f <- sqlContext.sessionState.functionRegistry.listFunction()) { +for (f <- spark.sessionState.functionRegistry.listFunction()) { if (!Seq("cube", "grouping", "grouping_id", "rollup", "window").contains(f)) { checkKeywordsNotExist(sql(s"describe function `$f`"), "N/A.") } @@ -102,7 +102,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { (43, 81, 24) ).toDF("a", "b", "c").registerTempTable("cachedData") -sqlContext.cacheTable("cachedData") +spark.catalog.cacheTable("cachedData") checkAnswer( sql("SELECT t1.b FROM cachedData, cachedData t1 GROUP BY t1.b"), Row(0) :: Row(81) :: Nil) @@ -193,7 +193,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("grouping on nested fields") { -sqlContext.read.json(sparkContext.parallelize( +spark.read.json(sparkContext.parallelize( """{"nested": {"attribute": 1}, "value": 2}""" :: Nil)) .registerTempTable("rows") @@ -211,7 +211,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-6201 
IN type conversion") { -sqlContext.read.json( +spark.read.json( sparkContext.parallelize( Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}"))) .registerTempTable("d") @@ -222,7 +222,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-11226 Skip empty line in json file") { -sqlContext.read.json( +spark.read.json( sparkContext.parallelize( Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}", ""))) .registerTempTable("d") @@ -258,9 +258,9 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("aggregation with codegen") { // Prepare a table that we can group some rows. -sqlContext.table("testData") - .union(sqlContext.table("testData")) - .union(sqlContext.table("testData")) +spark.table("testData") + .union(spark.table("testData")) + .union(spark.table("testData")) .registerTempTable("testData3x") try { @@ -333,7 +333,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { "SELECT sum('a'), avg('a'), count(null) FROM testData", Row(null, null, 0) :: Nil) } finally { - sqlContext.dropTempTable("testData3x") + spark.catalog.dropTempTable("testData3x") } } @@ -1041,7 +1041,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SET commands semantics using sql()") { -sqlContext.conf.clear() +spark.wrapped.conf.clear() val testKey = "test.key.0" val testVal = "test.val.0" val nonexistentKey = "nonexistent" @@ -1082,17 +1082,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { sql(s"SET $nonexistentKey"), Row(nonexistentKey, "") ) -sqlContext.conf.clear() +spark.wrapped.conf.clear() } test("SET commands with illegal or inappropriate argument") { -sqlContext.conf.clear() +spark.wrapped.conf.clear() // Set negative mapred.reduce.tasks for automatically determining // the number of reducers is not supported intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-1")) intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-01")) 
intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-2")) -sqlContext.conf.clear() +spark.wrapped.conf.clear() } test("apply schema") { @@ -1110,7 +1110,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(values(0).toInt, values(1), values(2).toBoolean, v4) } -val df1 = sqlContext.createDataFrame(rowRDD1, schema1) +val df1 = spark.createDataFrame(rowRDD1, schema1) df1.registerTempTable("applySchema1") checkAnswer(
[02/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index a9b1970..a2decad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -29,11 +29,11 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { val path = Utils.createTempDir() path.delete() -val df = sqlContext.range(100).select($"id", lit(1).as("data")) +val df = spark.range(100).select($"id", lit(1).as("data")) df.write.partitionBy("id").save(path.getCanonicalPath) checkAnswer( - sqlContext.read.load(path.getCanonicalPath), + spark.read.load(path.getCanonicalPath), (0 to 99).map(Row(1, _)).toSeq) Utils.deleteRecursively(path) @@ -43,12 +43,12 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { val path = Utils.createTempDir() path.delete() -val base = sqlContext.range(100) +val base = spark.range(100) val df = base.union(base).select($"id", lit(1).as("data")) df.write.partitionBy("id").save(path.getCanonicalPath) checkAnswer( - sqlContext.read.load(path.getCanonicalPath), + spark.read.load(path.getCanonicalPath), (0 to 99).map(Row(1, _)).toSeq ++ (0 to 99).map(Row(1, _)).toSeq) Utils.deleteRecursively(path) @@ -58,7 +58,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { withTempPath { f => val path = f.getAbsolutePath Seq(1 -> "a").toDF("i", "j").write.partitionBy("i").parquet(path) - assert(sqlContext.read.parquet(path).schema.map(_.name) == Seq("j", "i")) + assert(spark.read.parquet(path).schema.map(_.name) == Seq("j", "i")) } } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala index 3d69c8a..a743cdd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala @@ -41,13 +41,13 @@ class ContinuousQueryManagerSuite extends StreamTest with SharedSQLContext with override val streamingTimeout = 20.seconds before { -assert(sqlContext.streams.active.isEmpty) -sqlContext.streams.resetTerminated() +assert(spark.streams.active.isEmpty) +spark.streams.resetTerminated() } after { -assert(sqlContext.streams.active.isEmpty) -sqlContext.streams.resetTerminated() +assert(spark.streams.active.isEmpty) +spark.streams.resetTerminated() } testQuietly("listing") { @@ -57,26 +57,26 @@ class ContinuousQueryManagerSuite extends StreamTest with SharedSQLContext with withQueriesOn(ds1, ds2, ds3) { queries => require(queries.size === 3) - assert(sqlContext.streams.active.toSet === queries.toSet) + assert(spark.streams.active.toSet === queries.toSet) val (q1, q2, q3) = (queries(0), queries(1), queries(2)) - assert(sqlContext.streams.get(q1.name).eq(q1)) - assert(sqlContext.streams.get(q2.name).eq(q2)) - assert(sqlContext.streams.get(q3.name).eq(q3)) + assert(spark.streams.get(q1.name).eq(q1)) + assert(spark.streams.get(q2.name).eq(q2)) + assert(spark.streams.get(q3.name).eq(q3)) intercept[IllegalArgumentException] { -sqlContext.streams.get("non-existent-name") +spark.streams.get("non-existent-name") } q1.stop() - assert(sqlContext.streams.active.toSet === Set(q2, q3)) + assert(spark.streams.active.toSet === Set(q2, q3)) val ex1 = withClue("no error while getting non-active query") { 
intercept[IllegalArgumentException] { - sqlContext.streams.get(q1.name) + spark.streams.get(q1.name) } } assert(ex1.getMessage.contains(q1.name), "error does not contain name of query to be fetched") - assert(sqlContext.streams.get(q2.name).eq(q2)) + assert(spark.streams.get(q2.name).eq(q2)) m2.addData(0) // q2 should terminate with error @@ -86,11 +86,11 @@ class ContinuousQueryManagerSuite extends StreamTest with SharedSQLContext with } withClue("no error while getting non-active query") { intercept[IllegalArgumentException] { -
[10/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
[SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites ## What changes were proposed in this pull request? Use SparkSession instead of SQLContext in Scala/Java TestSuites as this PR already very big working Python TestSuites in a diff PR. ## How was this patch tested? Existing tests Author: Sandeep SinghCloses #12907 from techaddict/SPARK-15037. (cherry picked from commit ed0b4070fb50054b1ecf66ff6c32458a4967dfd3) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5bf74b44 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5bf74b44 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5bf74b44 Branch: refs/heads/branch-2.0 Commit: 5bf74b44d9efcb8b0f0c3e7d129bc5ba31419551 Parents: 19a9c23 Author: Sandeep Singh Authored: Tue May 10 11:17:47 2016 -0700 Committer: Andrew Or Committed: Tue May 10 11:17:58 2016 -0700 -- .../org/apache/spark/ml/JavaPipelineSuite.java | 27 +- .../spark/ml/attribute/JavaAttributeSuite.java | 2 +- .../JavaDecisionTreeClassifierSuite.java| 23 +- .../classification/JavaGBTClassifierSuite.java | 18 +- .../JavaLogisticRegressionSuite.java| 49 ++-- ...JavaMultilayerPerceptronClassifierSuite.java | 36 +-- .../ml/classification/JavaNaiveBayesSuite.java | 18 +- .../ml/classification/JavaOneVsRestSuite.java | 90 +++--- .../JavaRandomForestClassifierSuite.java| 26 +- .../spark/ml/clustering/JavaKMeansSuite.java| 26 +- .../spark/ml/feature/JavaBucketizerSuite.java | 20 +- .../apache/spark/ml/feature/JavaDCTSuite.java | 21 +- .../spark/ml/feature/JavaHashingTFSuite.java| 18 +- .../spark/ml/feature/JavaNormalizerSuite.java | 19 +- .../apache/spark/ml/feature/JavaPCASuite.java | 21 +- .../feature/JavaPolynomialExpansionSuite.java | 19 +- .../ml/feature/JavaStandardScalerSuite.java | 17 +- .../ml/feature/JavaStopWordsRemoverSuite.java | 22 +- .../ml/feature/JavaStringIndexerSuite.java | 26 +- 
.../spark/ml/feature/JavaTokenizerSuite.java| 21 +- .../ml/feature/JavaVectorAssemblerSuite.java| 31 ++- .../ml/feature/JavaVectorIndexerSuite.java | 18 +- .../spark/ml/feature/JavaVectorSlicerSuite.java | 18 +- .../spark/ml/feature/JavaWord2VecSuite.java | 22 +- .../apache/spark/ml/param/JavaParamsSuite.java | 14 +- .../apache/spark/ml/param/JavaTestParams.java | 38 ++- .../JavaDecisionTreeRegressorSuite.java | 18 +- .../ml/regression/JavaGBTRegressorSuite.java| 18 +- .../regression/JavaLinearRegressionSuite.java | 25 +- .../JavaRandomForestRegressorSuite.java | 28 +- .../source/libsvm/JavaLibSVMRelationSuite.java | 18 +- .../ml/tuning/JavaCrossValidatorSuite.java | 18 +- .../spark/ml/util/IdentifiableSuite.scala | 1 + .../ml/util/JavaDefaultReadWriteSuite.java | 21 +- .../JavaLogisticRegressionSuite.java| 35 ++- .../classification/JavaNaiveBayesSuite.java | 25 +- .../mllib/classification/JavaSVMSuite.java | 32 ++- .../clustering/JavaBisectingKMeansSuite.java| 27 +- .../clustering/JavaGaussianMixtureSuite.java| 20 +- .../spark/mllib/clustering/JavaKMeansSuite.java | 23 +- .../spark/mllib/clustering/JavaLDASuite.java| 37 +-- .../clustering/JavaStreamingKMeansSuite.java| 3 +- .../evaluation/JavaRankingMetricsSuite.java | 21 +- .../spark/mllib/feature/JavaTfIdfSuite.java | 22 +- .../spark/mllib/feature/JavaWord2VecSuite.java | 19 +- .../mllib/fpm/JavaAssociationRulesSuite.java| 23 +- .../spark/mllib/fpm/JavaFPGrowthSuite.java | 29 +- .../spark/mllib/fpm/JavaPrefixSpanSuite.java| 26 +- .../spark/mllib/linalg/JavaMatricesSuite.java | 278 ++- .../spark/mllib/linalg/JavaVectorsSuite.java| 7 +- .../spark/mllib/random/JavaRandomRDDsSuite.java | 136 - .../mllib/recommendation/JavaALSSuite.java | 64 +++-- .../regression/JavaIsotonicRegressionSuite.java | 22 +- .../spark/mllib/regression/JavaLassoSuite.java | 32 ++- .../regression/JavaLinearRegressionSuite.java | 42 +-- .../regression/JavaRidgeRegressionSuite.java| 22 +- .../spark/mllib/stat/JavaStatisticsSuite.java | 32 ++- 
.../spark/mllib/tree/JavaDecisionTreeSuite.java | 24 +- .../org/apache/spark/ml/PipelineSuite.scala | 2 +- .../ml/classification/ClassifierSuite.scala | 4 +- .../DecisionTreeClassifierSuite.scala | 4 +- .../ml/classification/GBTClassifierSuite.scala | 6 +- .../LogisticRegressionSuite.scala | 12 +- .../MultilayerPerceptronClassifierSuite.scala | 8 +- .../ml/classification/NaiveBayesSuite.scala | 12 +- .../ml/classification/OneVsRestSuite.scala |
[07/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 8e7e000..125ad02 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{Dataset, Row} object StopWordsRemoverSuite extends SparkFunSuite { def testStopWordsRemover(t: StopWordsRemover, dataset: Dataset[_]): Unit = { @@ -42,7 +42,7 @@ class StopWordsRemoverSuite val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("test", "test"), Seq("test", "test")), (Seq("a", "b", "c", "d"), Seq("b", "c")), (Seq("a", "the", "an"), Seq()), @@ -60,7 +60,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("test", "test"), Seq()), (Seq("a", "b", "c", "d"), Seq("b", "c", "d")), (Seq("a", "the", "an"), Seq()), @@ -77,7 +77,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setCaseSensitive(true) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("A"), Seq("A")), (Seq("The", "the"), Seq("The")) )).toDF("raw", "expected") @@ -98,7 +98,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords) -val 
dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("acaba", "ama", "biri"), Seq()), (Seq("hep", "her", "scala"), Seq("scala")) )).toDF("raw", "expected") @@ -112,7 +112,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords.toArray) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq("python", "scala", "a")), (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) )).toDF("raw", "expected") @@ -126,7 +126,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords.toArray) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq()), (Seq("Python", "Scala", "swift"), Seq("swift")) )).toDF("raw", "expected") @@ -148,7 +148,7 @@ class StopWordsRemoverSuite val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol(outputCol) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("The", "the", "swift"), Seq("swift")) )).toDF("raw", outputCol) http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index d0f3cdc..c221d4a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -39,7 +39,7 @@ class StringIndexerSuite test("StringIndexer") { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) -val df = sqlContext.createDataFrame(data).toDF("id", "label") +val df = spark.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() 
.setInputCol("label") .setOutputCol("labelIndex") @@ -63,8 +63,8 @@ class StringIndexerSuite test("StringIndexerUnseen") { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (4, "b")), 2) val data2 = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c")), 2) -val df = sqlContext.createDataFrame(data).toDF("id", "label") -val df2 = sqlContext.createDataFrame(data2).toDF("id", "label") +val df = spark.createDataFrame(data).toDF("id", "label") +val df2 =
[08/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java index 3db9b39..8b05675 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java @@ -32,15 +32,17 @@ import org.junit.Test; import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; public class JavaIsotonicRegressionSuite implements Serializable { - private transient JavaSparkContext sc; + private transient SparkSession spark; + private transient JavaSparkContext jsc; private static List> generateIsotonicInput(double[] labels) { List > input = new ArrayList<>(labels.length); for (int i = 1; i <= labels.length; i++) { - input.add(new Tuple3<>(labels[i-1], (double) i, 1.0)); + input.add(new Tuple3<>(labels[i - 1], (double) i, 1.0)); } return input; @@ -48,20 +50,24 @@ public class JavaIsotonicRegressionSuite implements Serializable { private IsotonicRegressionModel runIsotonicRegression(double[] labels) { JavaRDD > trainRDD = - sc.parallelize(generateIsotonicInput(labels), 2).cache(); + jsc.parallelize(generateIsotonicInput(labels), 2).cache(); return new IsotonicRegression().run(trainRDD); } @Before public void setUp() { -sc = new JavaSparkContext("local", "JavaLinearRegressionSuite"); +spark = SparkSession.builder() + .master("local") + .appName("JavaLinearRegressionSuite") + .getOrCreate(); +jsc = new JavaSparkContext(spark.sparkContext()); } @After public void tearDown() { -sc.stop(); -sc = null; +spark.stop(); +spark = null; } @Test @@ -70,7 +76,7 @@ public class 
JavaIsotonicRegressionSuite implements Serializable { runIsotonicRegression(new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12}); Assert.assertArrayEquals( - new double[] {1, 2, 7.0/3, 7.0/3, 6, 7, 8, 10, 10, 12}, model.predictions(), 1.0e-14); + new double[]{1, 2, 7.0 / 3, 7.0 / 3, 6, 7, 8, 10, 10, 12}, model.predictions(), 1.0e-14); } @Test @@ -78,7 +84,7 @@ public class JavaIsotonicRegressionSuite implements Serializable { IsotonicRegressionModel model = runIsotonicRegression(new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12}); -JavaDoubleRDD testRDD = sc.parallelizeDoubles(Arrays.asList(0.0, 1.0, 9.5, 12.0, 13.0)); +JavaDoubleRDD testRDD = jsc.parallelizeDoubles(Arrays.asList(0.0, 1.0, 9.5, 12.0, 13.0)); List predictions = model.predict(testRDD).collect(); Assert.assertEquals(1.0, predictions.get(0).doubleValue(), 1.0e-14); http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java index 8950b48..098bac3 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java @@ -28,24 +28,30 @@ import org.junit.Test; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.util.LinearDataGenerator; +import org.apache.spark.sql.SparkSession; public class JavaLassoSuite implements Serializable { - private transient JavaSparkContext sc; + private transient SparkSession spark; + private transient JavaSparkContext jsc; @Before public void setUp() { -sc = new JavaSparkContext("local", "JavaLassoSuite"); +spark = SparkSession.builder() + .master("local") + .appName("JavaLassoSuite") + .getOrCreate(); +jsc = new JavaSparkContext(spark.sparkContext()); } @After public void 
tearDown() { -sc.stop(); -sc = null; +spark.stop(); +spark = null; } int validatePrediction(List validationData, LassoModel model) { int numAccurate = 0; -for (LabeledPoint point: validationData) { +for (LabeledPoint point : validationData) { Double prediction = model.predict(point.features()); // A prediction is off if the prediction is more than 0.5 away from expected value. if (Math.abs(prediction - point.label()) <= 0.5) { @@
[03/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala index cef541f..373d3a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala @@ -21,9 +21,9 @@ import java.io.File import scala.collection.JavaConverters._ import scala.util.Try -import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.SparkConf import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession import org.apache.spark.util.{Benchmark, Utils} /** @@ -34,12 +34,16 @@ import org.apache.spark.util.{Benchmark, Utils} object ParquetReadBenchmark { val conf = new SparkConf() conf.set("spark.sql.parquet.compression.codec", "snappy") - val sc = new SparkContext("local[1]", "test-sql-context", conf) - val sqlContext = new SQLContext(sc) + + val spark = SparkSession.builder +.master("local[1]") +.appName("test-sql-context") +.config(conf) +.getOrCreate() // Set default configs. Individual cases will change them if necessary. 
- sqlContext.conf.setConfString(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true") - sqlContext.conf.setConfString(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") + spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true") + spark.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") def withTempPath(f: File => Unit): Unit = { val path = Utils.createTempDir() @@ -48,17 +52,17 @@ object ParquetReadBenchmark { } def withTempTable(tableNames: String*)(f: => Unit): Unit = { -try f finally tableNames.foreach(sqlContext.dropTempTable) +try f finally tableNames.foreach(spark.catalog.dropTempTable) } def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { val (keys, values) = pairs.unzip -val currentValues = keys.map(key => Try(sqlContext.conf.getConfString(key)).toOption) -(keys, values).zipped.foreach(sqlContext.conf.setConfString) +val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) +(keys, values).zipped.foreach(spark.conf.set) try f finally { keys.zip(currentValues).foreach { -case (key, Some(value)) => sqlContext.conf.setConfString(key, value) -case (key, None) => sqlContext.conf.unsetConf(key) +case (key, Some(value)) => spark.conf.set(key, value) +case (key, None) => spark.conf.unset(key) } } } @@ -71,18 +75,18 @@ object ParquetReadBenchmark { withTempPath { dir => withTempTable("t1", "tempTable") { -sqlContext.range(values).registerTempTable("t1") -sqlContext.sql("select cast(id as INT) as id from t1") +spark.range(values).registerTempTable("t1") +spark.sql("select cast(id as INT) as id from t1") .write.parquet(dir.getCanonicalPath) - sqlContext.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable") +spark.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable") sqlBenchmark.addCase("SQL Parquet Vectorized") { iter => - sqlContext.sql("select sum(id) from tempTable").collect() + spark.sql("select sum(id) from tempTable").collect() } sqlBenchmark.addCase("SQL Parquet MR") { iter => 
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { -sqlContext.sql("select sum(id) from tempTable").collect() +spark.sql("select sum(id) from tempTable").collect() } } @@ -155,20 +159,20 @@ object ParquetReadBenchmark { def intStringScanBenchmark(values: Int): Unit = { withTempPath { dir => withTempTable("t1", "tempTable") { -sqlContext.range(values).registerTempTable("t1") -sqlContext.sql("select cast(id as INT) as c1, cast(id as STRING) as c2 from t1") +spark.range(values).registerTempTable("t1") +spark.sql("select cast(id as INT) as c1, cast(id as STRING) as c2 from t1") .write.parquet(dir.getCanonicalPath) - sqlContext.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable") +spark.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable") val benchmark = new Benchmark("Int and String Scan", values) benchmark.addCase("SQL Parquet Vectorized") { iter => - sqlContext.sql("select sum(c1), sum(length(c2)) from tempTable").collect + spark.sql("select sum(c1),
[06/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java -- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java index 7863177..059c2d9 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaDatasetAggregatorSuiteBase.java @@ -26,36 +26,30 @@ import scala.Tuple2; import org.junit.After; import org.junit.Before; -import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.KeyValueGroupedDataset; -import org.apache.spark.sql.test.TestSQLContext; +import org.apache.spark.sql.test.TestSparkSession; /** * Common test base shared across this and Java8DatasetAggregatorSuite. 
*/ public class JavaDatasetAggregatorSuiteBase implements Serializable { - protected transient JavaSparkContext jsc; - protected transient TestSQLContext context; + private transient TestSparkSession spark; @Before public void setUp() { // Trigger static initializer of TestData -SparkContext sc = new SparkContext("local[*]", "testing"); -jsc = new JavaSparkContext(sc); -context = new TestSQLContext(sc); -context.loadTestData(); +spark = new TestSparkSession(); +spark.loadTestData(); } @After public void tearDown() { -context.sparkContext().stop(); -context = null; -jsc = null; +spark.stop(); +spark = null; } protectedTuple2 tuple2(T1 t1, T2 t2) { @@ -66,7 +60,7 @@ public class JavaDatasetAggregatorSuiteBase implements Serializable { Encoder > encoder = Encoders.tuple(Encoders.STRING(), Encoders.INT()); List > data = Arrays.asList(tuple2("a", 1), tuple2("a", 2), tuple2("b", 3)); -Dataset > ds = context.createDataset(data, encoder); +Dataset > ds = spark.createDataset(data, encoder); return ds.groupByKey( new MapFunction , String>() { http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java -- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java index 9e65158..d0435e4 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java @@ -19,14 +19,16 @@ package test.org.apache.spark.sql.sources; import java.io.File; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import 
org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.*; @@ -37,8 +39,8 @@ import org.apache.spark.util.Utils; public class JavaSaveLoadSuite { - private transient JavaSparkContext sc; - private transient SQLContext sqlContext; + private transient SparkSession spark; + private transient JavaSparkContext jsc; File path; Dataset df; @@ -52,9 +54,11 @@ public class JavaSaveLoadSuite { @Before public void setUp() throws IOException { -SparkContext _sc = new SparkContext("local[*]", "testing"); -sqlContext = new SQLContext(_sc); -sc = new JavaSparkContext(_sc); +spark = SparkSession.builder() + .master("local[*]") + .appName("testing") + .getOrCreate(); +jsc = new JavaSparkContext(spark.sparkContext()); path = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile(); @@ -66,16 +70,15 @@ public class JavaSaveLoadSuite { for (int i = 0; i < 10; i++) { jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}"); } -JavaRDD rdd = sc.parallelize(jsonObjects); -df = sqlContext.read().json(rdd); +JavaRDD rdd = jsc.parallelize(jsonObjects); +df = spark.read().json(rdd); df.registerTempTable("jsonTable"); } @After public void tearDown() { -sqlContext.sparkContext().stop(); -sqlContext = null; -sc = null; +
[01/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
Repository: spark Updated Branches: refs/heads/master bcfee153b -> ed0b4070f http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index 0ba72b0..0f416eb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -177,23 +177,23 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te (Seq[Integer](3), null, null)).toDF("key", "value1", "value2") data3.write.saveAsTable("agg3") -val emptyDF = sqlContext.createDataFrame( +val emptyDF = spark.createDataFrame( sparkContext.emptyRDD[Row], StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil)) emptyDF.registerTempTable("emptyTable") // Register UDAFs -sqlContext.udf.register("mydoublesum", new MyDoubleSum) -sqlContext.udf.register("mydoubleavg", new MyDoubleAvg) -sqlContext.udf.register("longProductSum", new LongProductSum) +spark.udf.register("mydoublesum", new MyDoubleSum) +spark.udf.register("mydoubleavg", new MyDoubleAvg) +spark.udf.register("longProductSum", new LongProductSum) } override def afterAll(): Unit = { try { - sqlContext.sql("DROP TABLE IF EXISTS agg1") - sqlContext.sql("DROP TABLE IF EXISTS agg2") - sqlContext.sql("DROP TABLE IF EXISTS agg3") - sqlContext.dropTempTable("emptyTable") + spark.sql("DROP TABLE IF EXISTS agg1") + spark.sql("DROP TABLE IF EXISTS agg2") + spark.sql("DROP TABLE IF EXISTS agg3") + spark.catalog.dropTempTable("emptyTable") } finally { super.afterAll() } @@ -210,7 +210,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te test("empty table") { // If there is no GROUP BY 
clause and the table is empty, we will generate a single row. checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT | AVG(value), @@ -227,7 +227,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(null, 0, 0, 0, null, null, null, null, null) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT | AVG(value), @@ -246,7 +246,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te // If there is a GROUP BY clause and the table is empty, there is no output. checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT | AVG(value), @@ -266,7 +266,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te test("null literal") { checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT | AVG(null), @@ -282,7 +282,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te test("only do grouping") { checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT key |FROM agg1 @@ -291,7 +291,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(1) :: Row(2) :: Row(3) :: Row(null) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT DISTINCT value1, key |FROM agg2 @@ -308,7 +308,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(null, null) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT value1, key |FROM agg2 @@ -326,7 +326,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(null, null) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT DISTINCT key |FROM agg3 @@ -341,7 +341,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(Seq[Integer](3)) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT value1, key |FROM agg3 @@ -363,7 +363,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te test("case in-sensitive resolution") { 
checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT avg(value), kEY - 100 |FROM agg1 @@ -372,7 +372,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(20.0, -99) :: Row(-0.5,
[04/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 1742df3..c31dffe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -27,16 +27,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowComments off") { val str = """{'name': /* hello */ 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowComments on") { val str = """{'name': /* hello */ 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowComments", "true").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.option("allowComments", "true").json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") @@ -44,16 +44,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowSingleQuotes off") { val str = """{'name': 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowSingleQuotes", "false").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.option("allowSingleQuotes", "false").json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowSingleQuotes on") { val str = """{'name': 'Reynold Xin'}""" -val rdd = 
sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") @@ -61,16 +61,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowUnquotedFieldNames off") { val str = """{name: 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowUnquotedFieldNames on") { val str = """{name: 'Reynold Xin'}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowUnquotedFieldNames", "true").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.option("allowUnquotedFieldNames", "true").json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") @@ -78,16 +78,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowNumericLeadingZeros off") { val str = """{"age": 0018}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowNumericLeadingZeros on") { val str = """{"age": 0018}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowNumericLeadingZeros", "true").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.option("allowNumericLeadingZeros", "true").json(rdd) assert(df.schema.head.name == "age") assert(df.first().getLong(0) == 18) @@ -97,16 +97,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { // 
JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS. ignore("allowNonNumericNumbers off") { val str = """{"age": NaN}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df = spark.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } ignore("allowNonNumericNumbers on") { val str = """{"age": NaN}""" -val rdd = sqlContext.sparkContext.parallelize(Seq(str)) -val df = sqlContext.read.option("allowNonNumericNumbers", "true").json(rdd) +val rdd = spark.sparkContext.parallelize(Seq(str)) +val df =
[07/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 8e7e000..125ad02 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{Dataset, Row} object StopWordsRemoverSuite extends SparkFunSuite { def testStopWordsRemover(t: StopWordsRemover, dataset: Dataset[_]): Unit = { @@ -42,7 +42,7 @@ class StopWordsRemoverSuite val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("test", "test"), Seq("test", "test")), (Seq("a", "b", "c", "d"), Seq("b", "c")), (Seq("a", "the", "an"), Seq()), @@ -60,7 +60,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("test", "test"), Seq()), (Seq("a", "b", "c", "d"), Seq("b", "c", "d")), (Seq("a", "the", "an"), Seq()), @@ -77,7 +77,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setCaseSensitive(true) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("A"), Seq("A")), (Seq("The", "the"), Seq("The")) )).toDF("raw", "expected") @@ -98,7 +98,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords) -val 
dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("acaba", "ama", "biri"), Seq()), (Seq("hep", "her", "scala"), Seq("scala")) )).toDF("raw", "expected") @@ -112,7 +112,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords.toArray) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq("python", "scala", "a")), (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) )).toDF("raw", "expected") @@ -126,7 +126,7 @@ class StopWordsRemoverSuite .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords.toArray) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq()), (Seq("Python", "Scala", "swift"), Seq("swift")) )).toDF("raw", "expected") @@ -148,7 +148,7 @@ class StopWordsRemoverSuite val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol(outputCol) -val dataSet = sqlContext.createDataFrame(Seq( +val dataSet = spark.createDataFrame(Seq( (Seq("The", "the", "swift"), Seq("swift")) )).toDF("raw", outputCol) http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index d0f3cdc..c221d4a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -39,7 +39,7 @@ class StringIndexerSuite test("StringIndexer") { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) -val df = sqlContext.createDataFrame(data).toDF("id", "label") +val df = spark.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() 
.setInputCol("label") .setOutputCol("labelIndex") @@ -63,8 +63,8 @@ class StringIndexerSuite test("StringIndexerUnseen") { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (4, "b")), 2) val data2 = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c")), 2) -val df = sqlContext.createDataFrame(data).toDF("id", "label") -val df2 = sqlContext.createDataFrame(data2).toDF("id", "label") +val df = spark.createDataFrame(data).toDF("id", "label") +val df2 =
[01/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
Repository: spark Updated Branches: refs/heads/branch-2.0 19a9c23c2 -> 5bf74b44d http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index 0ba72b0..0f416eb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -177,23 +177,23 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te (Seq[Integer](3), null, null)).toDF("key", "value1", "value2") data3.write.saveAsTable("agg3") -val emptyDF = sqlContext.createDataFrame( +val emptyDF = spark.createDataFrame( sparkContext.emptyRDD[Row], StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil)) emptyDF.registerTempTable("emptyTable") // Register UDAFs -sqlContext.udf.register("mydoublesum", new MyDoubleSum) -sqlContext.udf.register("mydoubleavg", new MyDoubleAvg) -sqlContext.udf.register("longProductSum", new LongProductSum) +spark.udf.register("mydoublesum", new MyDoubleSum) +spark.udf.register("mydoubleavg", new MyDoubleAvg) +spark.udf.register("longProductSum", new LongProductSum) } override def afterAll(): Unit = { try { - sqlContext.sql("DROP TABLE IF EXISTS agg1") - sqlContext.sql("DROP TABLE IF EXISTS agg2") - sqlContext.sql("DROP TABLE IF EXISTS agg3") - sqlContext.dropTempTable("emptyTable") + spark.sql("DROP TABLE IF EXISTS agg1") + spark.sql("DROP TABLE IF EXISTS agg2") + spark.sql("DROP TABLE IF EXISTS agg3") + spark.catalog.dropTempTable("emptyTable") } finally { super.afterAll() } @@ -210,7 +210,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te test("empty table") { // If there is no GROUP 
BY clause and the table is empty, we will generate a single row. checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT | AVG(value), @@ -227,7 +227,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(null, 0, 0, 0, null, null, null, null, null) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT | AVG(value), @@ -246,7 +246,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te // If there is a GROUP BY clause and the table is empty, there is no output. checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT | AVG(value), @@ -266,7 +266,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te test("null literal") { checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT | AVG(null), @@ -282,7 +282,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te test("only do grouping") { checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT key |FROM agg1 @@ -291,7 +291,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(1) :: Row(2) :: Row(3) :: Row(null) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT DISTINCT value1, key |FROM agg2 @@ -308,7 +308,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(null, null) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT value1, key |FROM agg2 @@ -326,7 +326,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(null, null) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT DISTINCT key |FROM agg3 @@ -341,7 +341,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(Seq[Integer](3)) :: Nil) checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT value1, key |FROM agg3 @@ -363,7 +363,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te test("case in-sensitive resolution") { 
checkAnswer( - sqlContext.sql( + spark.sql( """ |SELECT avg(value), kEY - 100 |FROM agg1 @@ -372,7 +372,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te Row(20.0, -99) ::
[09/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java index 9f81751..00c59f0 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java @@ -30,25 +30,26 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import static org.apache.spark.mllib.classification.LogisticRegressionSuite - .generateLogisticInputAsList; - +import org.apache.spark.sql.SparkSession; +import static org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInputAsList; public class JavaLinearRegressionSuite implements Serializable { + private transient SparkSession spark; private transient JavaSparkContext jsc; - private transient SQLContext jsql; private transient Dataset dataset; private transient JavaRDD datasetRDD; @Before public void setUp() { -jsc = new JavaSparkContext("local", "JavaLinearRegressionSuite"); -jsql = new SQLContext(jsc); +spark = SparkSession.builder() + .master("local") + .appName("JavaLinearRegressionSuite") + .getOrCreate(); +jsc = new JavaSparkContext(spark.sparkContext()); List points = generateLogisticInputAsList(1.0, 1.0, 100, 42); datasetRDD = jsc.parallelize(points, 2); -dataset = jsql.createDataFrame(datasetRDD, LabeledPoint.class); +dataset = spark.createDataFrame(datasetRDD, LabeledPoint.class); dataset.registerTempTable("dataset"); } @@ -65,7 +66,7 @@ public class JavaLinearRegressionSuite implements Serializable { assertEquals("auto", lr.getSolver()); LinearRegressionModel model = 
lr.fit(dataset); model.transform(dataset).registerTempTable("prediction"); -Dataset predictions = jsql.sql("SELECT label, prediction FROM prediction"); +Dataset predictions = spark.sql("SELECT label, prediction FROM prediction"); predictions.collect(); // Check defaults assertEquals("features", model.getFeaturesCol()); @@ -76,8 +77,8 @@ public class JavaLinearRegressionSuite implements Serializable { public void linearRegressionWithSetters() { // Set params, train, and check as many params as we can. LinearRegression lr = new LinearRegression() -.setMaxIter(10) -.setRegParam(1.0).setSolver("l-bfgs"); + .setMaxIter(10) + .setRegParam(1.0).setSolver("l-bfgs"); LinearRegressionModel model = lr.fit(dataset); LinearRegression parent = (LinearRegression) model.parent(); assertEquals(10, parent.getMaxIter()); @@ -85,7 +86,7 @@ public class JavaLinearRegressionSuite implements Serializable { // Call fit() with new params, and check as many params as we can. LinearRegressionModel model2 = -lr.fit(dataset, lr.maxIter().w(5), lr.regParam().w(0.1), lr.predictionCol().w("thePred")); + lr.fit(dataset, lr.maxIter().w(5), lr.regParam().w(0.1), lr.predictionCol().w("thePred")); LinearRegression parent2 = (LinearRegression) model2.parent(); assertEquals(5, parent2.getMaxIter()); assertEquals(0.1, parent2.getRegParam(), 0.0); http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java index 38b895f..fdb41ff 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaRandomForestRegressorSuite.java @@ -28,27 +28,33 @@ import org.junit.Test; import org.apache.spark.api.java.JavaRDD; import 
org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.classification.LogisticRegressionSuite; import org.apache.spark.ml.tree.impl.TreeTests; +import org.apache.spark.mllib.classification.LogisticRegressionSuite; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; public class JavaRandomForestRegressorSuite implements Serializable { - private transient JavaSparkContext sc; + private transient SparkSession spark; + private transient JavaSparkContext
[08/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java index 3db9b39..8b05675 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java @@ -32,15 +32,17 @@ import org.junit.Test; import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; public class JavaIsotonicRegressionSuite implements Serializable { - private transient JavaSparkContext sc; + private transient SparkSession spark; + private transient JavaSparkContext jsc; private static List> generateIsotonicInput(double[] labels) { List > input = new ArrayList<>(labels.length); for (int i = 1; i <= labels.length; i++) { - input.add(new Tuple3<>(labels[i-1], (double) i, 1.0)); + input.add(new Tuple3<>(labels[i - 1], (double) i, 1.0)); } return input; @@ -48,20 +50,24 @@ public class JavaIsotonicRegressionSuite implements Serializable { private IsotonicRegressionModel runIsotonicRegression(double[] labels) { JavaRDD > trainRDD = - sc.parallelize(generateIsotonicInput(labels), 2).cache(); + jsc.parallelize(generateIsotonicInput(labels), 2).cache(); return new IsotonicRegression().run(trainRDD); } @Before public void setUp() { -sc = new JavaSparkContext("local", "JavaLinearRegressionSuite"); +spark = SparkSession.builder() + .master("local") + .appName("JavaLinearRegressionSuite") + .getOrCreate(); +jsc = new JavaSparkContext(spark.sparkContext()); } @After public void tearDown() { -sc.stop(); -sc = null; +spark.stop(); +spark = null; } @Test @@ -70,7 +76,7 @@ public class 
JavaIsotonicRegressionSuite implements Serializable { runIsotonicRegression(new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12}); Assert.assertArrayEquals( - new double[] {1, 2, 7.0/3, 7.0/3, 6, 7, 8, 10, 10, 12}, model.predictions(), 1.0e-14); + new double[]{1, 2, 7.0 / 3, 7.0 / 3, 6, 7, 8, 10, 10, 12}, model.predictions(), 1.0e-14); } @Test @@ -78,7 +84,7 @@ public class JavaIsotonicRegressionSuite implements Serializable { IsotonicRegressionModel model = runIsotonicRegression(new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12}); -JavaDoubleRDD testRDD = sc.parallelizeDoubles(Arrays.asList(0.0, 1.0, 9.5, 12.0, 13.0)); +JavaDoubleRDD testRDD = jsc.parallelizeDoubles(Arrays.asList(0.0, 1.0, 9.5, 12.0, 13.0)); List predictions = model.predict(testRDD).collect(); Assert.assertEquals(1.0, predictions.get(0).doubleValue(), 1.0e-14); http://git-wip-us.apache.org/repos/asf/spark/blob/5bf74b44/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java index 8950b48..098bac3 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLassoSuite.java @@ -28,24 +28,30 @@ import org.junit.Test; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.util.LinearDataGenerator; +import org.apache.spark.sql.SparkSession; public class JavaLassoSuite implements Serializable { - private transient JavaSparkContext sc; + private transient SparkSession spark; + private transient JavaSparkContext jsc; @Before public void setUp() { -sc = new JavaSparkContext("local", "JavaLassoSuite"); +spark = SparkSession.builder() + .master("local") + .appName("JavaLassoSuite") + .getOrCreate(); +jsc = new JavaSparkContext(spark.sparkContext()); } @After public void 
tearDown() { -sc.stop(); -sc = null; +spark.stop(); +spark = null; } int validatePrediction(List validationData, LassoModel model) { int numAccurate = 0; -for (LabeledPoint point: validationData) { +for (LabeledPoint point : validationData) { Double prediction = model.predict(point.features()); // A prediction is off if the prediction is more than 0.5 away from expected value. if (Math.abs(prediction - point.label()) <= 0.5) { @@
[02/10] spark git commit: [SPARK-15037][SQL][MLLIB] Use SparkSession instead of SQLContext in Scala/Java TestSuites
http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index a9b1970..a2decad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -29,11 +29,11 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { val path = Utils.createTempDir() path.delete() -val df = sqlContext.range(100).select($"id", lit(1).as("data")) +val df = spark.range(100).select($"id", lit(1).as("data")) df.write.partitionBy("id").save(path.getCanonicalPath) checkAnswer( - sqlContext.read.load(path.getCanonicalPath), + spark.read.load(path.getCanonicalPath), (0 to 99).map(Row(1, _)).toSeq) Utils.deleteRecursively(path) @@ -43,12 +43,12 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { val path = Utils.createTempDir() path.delete() -val base = sqlContext.range(100) +val base = spark.range(100) val df = base.union(base).select($"id", lit(1).as("data")) df.write.partitionBy("id").save(path.getCanonicalPath) checkAnswer( - sqlContext.read.load(path.getCanonicalPath), + spark.read.load(path.getCanonicalPath), (0 to 99).map(Row(1, _)).toSeq ++ (0 to 99).map(Row(1, _)).toSeq) Utils.deleteRecursively(path) @@ -58,7 +58,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { withTempPath { f => val path = f.getAbsolutePath Seq(1 -> "a").toDF("i", "j").write.partitionBy("i").parquet(path) - assert(sqlContext.read.parquet(path).schema.map(_.name) == Seq("j", "i")) + assert(spark.read.parquet(path).schema.map(_.name) == Seq("j", "i")) } } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/ed0b4070/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala index 3d69c8a..a743cdd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ContinuousQueryManagerSuite.scala @@ -41,13 +41,13 @@ class ContinuousQueryManagerSuite extends StreamTest with SharedSQLContext with override val streamingTimeout = 20.seconds before { -assert(sqlContext.streams.active.isEmpty) -sqlContext.streams.resetTerminated() +assert(spark.streams.active.isEmpty) +spark.streams.resetTerminated() } after { -assert(sqlContext.streams.active.isEmpty) -sqlContext.streams.resetTerminated() +assert(spark.streams.active.isEmpty) +spark.streams.resetTerminated() } testQuietly("listing") { @@ -57,26 +57,26 @@ class ContinuousQueryManagerSuite extends StreamTest with SharedSQLContext with withQueriesOn(ds1, ds2, ds3) { queries => require(queries.size === 3) - assert(sqlContext.streams.active.toSet === queries.toSet) + assert(spark.streams.active.toSet === queries.toSet) val (q1, q2, q3) = (queries(0), queries(1), queries(2)) - assert(sqlContext.streams.get(q1.name).eq(q1)) - assert(sqlContext.streams.get(q2.name).eq(q2)) - assert(sqlContext.streams.get(q3.name).eq(q3)) + assert(spark.streams.get(q1.name).eq(q1)) + assert(spark.streams.get(q2.name).eq(q2)) + assert(spark.streams.get(q3.name).eq(q3)) intercept[IllegalArgumentException] { -sqlContext.streams.get("non-existent-name") +spark.streams.get("non-existent-name") } q1.stop() - assert(sqlContext.streams.active.toSet === Set(q2, q3)) + assert(spark.streams.active.toSet === Set(q2, q3)) val ex1 = withClue("no error while getting non-active query") { 
intercept[IllegalArgumentException] { - sqlContext.streams.get(q1.name) + spark.streams.get(q1.name) } } assert(ex1.getMessage.contains(q1.name), "error does not contain name of query to be fetched") - assert(sqlContext.streams.get(q2.name).eq(q2)) + assert(spark.streams.get(q2.name).eq(q2)) m2.addData(0) // q2 should terminate with error @@ -86,11 +86,11 @@ class ContinuousQueryManagerSuite extends StreamTest with SharedSQLContext with } withClue("no error while getting non-active query") { intercept[IllegalArgumentException] { -
spark git commit: [SPARK-15126][SQL] RuntimeConfig.set should return Unit
Repository: spark Updated Branches: refs/heads/branch-2.0 e868a15a7 -> 45862f6c9 [SPARK-15126][SQL] RuntimeConfig.set should return Unit ## What changes were proposed in this pull request? Currently we return RuntimeConfig itself to facilitate chaining. However, it makes the output in interactive environments (e.g. notebooks, scala repl) weird because it'd show the response of calling set as a RuntimeConfig itself. ## How was this patch tested? Updated unit tests. Author: Reynold XinCloses #12902 from rxin/SPARK-15126. (cherry picked from commit 6ae9fc00ed6ef530a9c42c8407fc66fd873239cc) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/45862f6c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/45862f6c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/45862f6c Branch: refs/heads/branch-2.0 Commit: 45862f6c935c36969a62a3fbb863cce55c4a6426 Parents: e868a15 Author: Reynold Xin Authored: Wed May 4 14:26:05 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:26:16 2016 -0700 -- python/pyspark/sql/conf.py | 1 - python/pyspark/sql/session.py | 3 -- .../org/apache/spark/sql/RuntimeConfig.scala| 7 ++- .../apache/spark/sql/RuntimeConfigSuite.scala | 57 .../spark/sql/internal/RuntimeConfigSuite.scala | 57 5 files changed, 60 insertions(+), 65 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/45862f6c/python/pyspark/sql/conf.py -- diff --git a/python/pyspark/sql/conf.py b/python/pyspark/sql/conf.py index 7428c91..609d882 100644 --- a/python/pyspark/sql/conf.py +++ b/python/pyspark/sql/conf.py @@ -23,7 +23,6 @@ class RuntimeConfig(object): """User-facing configuration API, accessible through `SparkSession.conf`. Options set here are automatically propagated to the Hadoop configuration during I/O. -This a thin wrapper around its Scala implementation org.apache.spark.sql.RuntimeConfig. 
""" def __init__(self, jconf): http://git-wip-us.apache.org/repos/asf/spark/blob/45862f6c/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index fb3e318..04842f6 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -71,9 +71,6 @@ class SparkSession(object): .config("spark.some.config.option", "some-value") \ .getOrCreate() -:param sparkContext: The :class:`SparkContext` backing this SparkSession. -:param jsparkSession: An optional JVM Scala SparkSession. If set, we do not instantiate a new -SparkSession in the JVM, instead we make all calls to this object. """ class Builder(object): http://git-wip-us.apache.org/repos/asf/spark/blob/45862f6c/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index 4fd6e42..7e07e0c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -35,9 +35,8 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { * * @since 2.0.0 */ - def set(key: String, value: String): RuntimeConfig = { + def set(key: String, value: String): Unit = { sqlConf.setConfString(key, value) -this } /** @@ -45,7 +44,7 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { * * @since 2.0.0 */ - def set(key: String, value: Boolean): RuntimeConfig = { + def set(key: String, value: Boolean): Unit = { set(key, value.toString) } @@ -54,7 +53,7 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { * * @since 2.0.0 */ - def set(key: String, value: Long): RuntimeConfig = { + def set(key: String, value: Long): Unit = { set(key, value.toString) } http://git-wip-us.apache.org/repos/asf/spark/blob/45862f6c/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala -- diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala new file mode 100644 index 000..cfe2e9f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala @@ -0,0
spark git commit: [SPARK-15126][SQL] RuntimeConfig.set should return Unit
Repository: spark Updated Branches: refs/heads/master 0fd3a4748 -> 6ae9fc00e [SPARK-15126][SQL] RuntimeConfig.set should return Unit ## What changes were proposed in this pull request? Currently we return RuntimeConfig itself to facilitate chaining. However, it makes the output in interactive environments (e.g. notebooks, scala repl) weird because it'd show the response of calling set as a RuntimeConfig itself. ## How was this patch tested? Updated unit tests. Author: Reynold XinCloses #12902 from rxin/SPARK-15126. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ae9fc00 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ae9fc00 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ae9fc00 Branch: refs/heads/master Commit: 6ae9fc00ed6ef530a9c42c8407fc66fd873239cc Parents: 0fd3a47 Author: Reynold Xin Authored: Wed May 4 14:26:05 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:26:05 2016 -0700 -- python/pyspark/sql/conf.py | 1 - python/pyspark/sql/session.py | 3 -- .../org/apache/spark/sql/RuntimeConfig.scala| 7 ++- .../apache/spark/sql/RuntimeConfigSuite.scala | 57 .../spark/sql/internal/RuntimeConfigSuite.scala | 57 5 files changed, 60 insertions(+), 65 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6ae9fc00/python/pyspark/sql/conf.py -- diff --git a/python/pyspark/sql/conf.py b/python/pyspark/sql/conf.py index 7428c91..609d882 100644 --- a/python/pyspark/sql/conf.py +++ b/python/pyspark/sql/conf.py @@ -23,7 +23,6 @@ class RuntimeConfig(object): """User-facing configuration API, accessible through `SparkSession.conf`. Options set here are automatically propagated to the Hadoop configuration during I/O. -This a thin wrapper around its Scala implementation org.apache.spark.sql.RuntimeConfig. 
""" def __init__(self, jconf): http://git-wip-us.apache.org/repos/asf/spark/blob/6ae9fc00/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index fb3e318..04842f6 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -71,9 +71,6 @@ class SparkSession(object): .config("spark.some.config.option", "some-value") \ .getOrCreate() -:param sparkContext: The :class:`SparkContext` backing this SparkSession. -:param jsparkSession: An optional JVM Scala SparkSession. If set, we do not instantiate a new -SparkSession in the JVM, instead we make all calls to this object. """ class Builder(object): http://git-wip-us.apache.org/repos/asf/spark/blob/6ae9fc00/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index 4fd6e42..7e07e0c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -35,9 +35,8 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { * * @since 2.0.0 */ - def set(key: String, value: String): RuntimeConfig = { + def set(key: String, value: String): Unit = { sqlConf.setConfString(key, value) -this } /** @@ -45,7 +44,7 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { * * @since 2.0.0 */ - def set(key: String, value: Boolean): RuntimeConfig = { + def set(key: String, value: Boolean): Unit = { set(key, value.toString) } @@ -54,7 +53,7 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { * * @since 2.0.0 */ - def set(key: String, value: Long): RuntimeConfig = { + def set(key: String, value: Long): Unit = { set(key, value.toString) } http://git-wip-us.apache.org/repos/asf/spark/blob/6ae9fc00/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala -- diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala new file mode 100644 index 000..cfe2e9f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/RuntimeConfigSuite.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the
spark git commit: [SPARK-15121] Improve logging of external shuffle handler
Repository: spark Updated Branches: refs/heads/master 6ae9fc00e -> 0c00391f7 [SPARK-15121] Improve logging of external shuffle handler ## What changes were proposed in this pull request? Add more informative logging in the external shuffle service to aid in debugging who is connecting to the YARN Nodemanager when the external shuffle service runs under it. ## How was this patch tested? Ran and saw logs coming out in log file. Author: Thomas GravesCloses #12900 from tgravescs/SPARK-15121. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c00391f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c00391f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c00391f Branch: refs/heads/master Commit: 0c00391f77359efdbf9dbd26d4c8186be8839922 Parents: 6ae9fc0 Author: Thomas Graves Authored: Wed May 4 14:28:26 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:28:26 2016 -0700 -- .../spark/network/shuffle/ExternalShuffleBlockHandler.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0c00391f/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java -- diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java index f8d03b3..fb1226c 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java @@ -35,6 +35,7 @@ import org.apache.spark.network.server.RpcHandler; import org.apache.spark.network.server.StreamManager; import org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.AppExecId; import org.apache.spark.network.shuffle.protocol.*; +import 
org.apache.spark.network.util.NettyUtils; import org.apache.spark.network.util.TransportConf; @@ -86,7 +87,8 @@ public class ExternalShuffleBlockHandler extends RpcHandler { blocks.add(blockManager.getBlockData(msg.appId, msg.execId, blockId)); } long streamId = streamManager.registerStream(client.getClientId(), blocks.iterator()); - logger.trace("Registered streamId {} with {} buffers", streamId, msg.blockIds.length); + logger.trace("Registered streamId {} with {} buffers for client {} from host {}", streamId, +msg.blockIds.length, client.getClientId(), NettyUtils.getRemoteAddress(client.getChannel())); callback.onSuccess(new StreamHandle(streamId, msg.blockIds.length).toByteBuffer()); } else if (msgObj instanceof RegisterExecutor) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15121] Improve logging of external shuffle handler
Repository: spark Updated Branches: refs/heads/branch-2.0 45862f6c9 -> eeb18f6d7 [SPARK-15121] Improve logging of external shuffle handler ## What changes were proposed in this pull request? Add more informative logging in the external shuffle service to aid in debugging who is connecting to the YARN Nodemanager when the external shuffle service runs under it. ## How was this patch tested? Ran and saw logs coming out in log file. Author: Thomas GravesCloses #12900 from tgravescs/SPARK-15121. (cherry picked from commit 0c00391f77359efdbf9dbd26d4c8186be8839922) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eeb18f6d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eeb18f6d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eeb18f6d Branch: refs/heads/branch-2.0 Commit: eeb18f6d70bc75f6d1292938292ad066d85ced8a Parents: 45862f6 Author: Thomas Graves Authored: Wed May 4 14:28:26 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:28:40 2016 -0700 -- .../spark/network/shuffle/ExternalShuffleBlockHandler.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eeb18f6d/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java -- diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java index f8d03b3..fb1226c 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockHandler.java @@ -35,6 +35,7 @@ import org.apache.spark.network.server.RpcHandler; import org.apache.spark.network.server.StreamManager; import 
org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.AppExecId; import org.apache.spark.network.shuffle.protocol.*; +import org.apache.spark.network.util.NettyUtils; import org.apache.spark.network.util.TransportConf; @@ -86,7 +87,8 @@ public class ExternalShuffleBlockHandler extends RpcHandler { blocks.add(blockManager.getBlockData(msg.appId, msg.execId, blockId)); } long streamId = streamManager.registerStream(client.getClientId(), blocks.iterator()); - logger.trace("Registered streamId {} with {} buffers", streamId, msg.blockIds.length); + logger.trace("Registered streamId {} with {} buffers for client {} from host {}", streamId, +msg.blockIds.length, client.getClientId(), NettyUtils.getRemoteAddress(client.getChannel())); callback.onSuccess(new StreamHandle(streamId, msg.blockIds.length).toByteBuffer()); } else if (msgObj instanceof RegisterExecutor) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12299][CORE] Remove history serving functionality from Master
Repository: spark Updated Branches: refs/heads/branch-2.0 eeb18f6d7 -> c0715f33b [SPARK-12299][CORE] Remove history serving functionality from Master Remove history server functionality from standalone Master. Previously, the Master process rebuilt a SparkUI once the application was completed which sometimes caused problems, such as OOM, when the application event log is large (see SPARK-6270). Keeping this functionality out of the Master will help to simplify the process and increase stability. Testing for this change included running core unit tests and manually running an application on a standalone cluster to verify that it completed successfully and that the Master UI functioned correctly. Also added 2 unit tests to verify killing an application and driver from MasterWebUI makes the correct request to the Master. Author: Bryan CutlerCloses #10991 from BryanCutler/remove-history-master-SPARK-12299. (cherry picked from commit cf2e9da612397233ae7bca0e9ce57309f16226b5) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0715f33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0715f33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0715f33 Branch: refs/heads/branch-2.0 Commit: c0715f33b456f8379117a6ecae3ff2cda6f59a7c Parents: eeb18f6 Author: Bryan Cutler Authored: Wed May 4 14:29:54 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:30:07 2016 -0700 -- .../spark/deploy/master/ApplicationInfo.scala | 9 -- .../org/apache/spark/deploy/master/Master.scala | 109 + .../spark/deploy/master/MasterMessages.scala| 2 - .../deploy/master/ui/ApplicationPage.scala | 6 +- .../deploy/master/ui/HistoryNotFoundPage.scala | 73 .../spark/deploy/master/ui/MasterPage.scala | 8 +- .../spark/deploy/master/ui/MasterWebUI.scala| 40 +-- .../spark/status/api/v1/ApiRootResource.scala | 2 +- .../status/api/v1/ApplicationListResource.scala | 30 - 
.../deploy/master/ui/MasterWebUISuite.scala | 118 +++ docs/monitoring.md | 5 - 11 files changed, 86 insertions(+), 316 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0715f33/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala index 4ffb528..53564d0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala @@ -41,7 +41,6 @@ private[spark] class ApplicationInfo( @transient var coresGranted: Int = _ @transient var endTime: Long = _ @transient var appSource: ApplicationSource = _ - @transient @volatile var appUIUrlAtHistoryServer: Option[String] = None // A cap on the number of executors this application can have at any given time. // By default, this is infinite. Only after the first allocation request is issued by the @@ -66,7 +65,6 @@ private[spark] class ApplicationInfo( nextExecutorId = 0 removedExecutors = new ArrayBuffer[ExecutorDesc] executorLimit = desc.initialExecutorLimit.getOrElse(Integer.MAX_VALUE) -appUIUrlAtHistoryServer = None } private def newExecutorId(useID: Option[Int] = None): Int = { @@ -136,11 +134,4 @@ private[spark] class ApplicationInfo( System.currentTimeMillis() - startTime } } - - /** - * Returns the original application UI url unless there is its address at history server - * is defined - */ - def curAppUIUrl: String = appUIUrlAtHistoryServer.getOrElse(desc.appUiUrl) - } http://git-wip-us.apache.org/repos/asf/spark/blob/c0715f33/core/src/main/scala/org/apache/spark/deploy/master/Master.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index edc9be2..faed4f4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala 
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -17,25 +17,17 @@ package org.apache.spark.deploy.master -import java.io.FileNotFoundException -import java.net.URLEncoder import java.text.SimpleDateFormat import java.util.Date -import java.util.concurrent.{ConcurrentHashMap, ScheduledFuture, TimeUnit} +import java.util.concurrent.{ScheduledFuture, TimeUnit} import
spark git commit: [SPARK-12299][CORE] Remove history serving functionality from Master
Repository: spark Updated Branches: refs/heads/master 0c00391f7 -> cf2e9da61 [SPARK-12299][CORE] Remove history serving functionality from Master Remove history server functionality from standalone Master. Previously, the Master process rebuilt a SparkUI once the application was completed, which sometimes caused problems, such as OOM, when the application event log is large (see SPARK-6270). Keeping this functionality out of the Master will help to simplify the process and increase stability. Testing for this change included running core unit tests and manually running an application on a standalone cluster to verify that it completed successfully and that the Master UI functioned correctly. Also added 2 unit tests to verify killing an application and driver from MasterWebUI makes the correct request to the Master. Author: Bryan Cutler. Closes #10991 from BryanCutler/remove-history-master-SPARK-12299. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf2e9da6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf2e9da6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf2e9da6 Branch: refs/heads/master Commit: cf2e9da612397233ae7bca0e9ce57309f16226b5 Parents: 0c00391 Author: Bryan Cutler Authored: Wed May 4 14:29:54 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:29:54 2016 -0700 -- .../spark/deploy/master/ApplicationInfo.scala | 9 -- .../org/apache/spark/deploy/master/Master.scala | 109 + .../spark/deploy/master/MasterMessages.scala| 2 - .../deploy/master/ui/ApplicationPage.scala | 6 +- .../deploy/master/ui/HistoryNotFoundPage.scala | 73 .../spark/deploy/master/ui/MasterPage.scala | 8 +- .../spark/deploy/master/ui/MasterWebUI.scala| 40 +-- .../spark/status/api/v1/ApiRootResource.scala | 2 +- .../status/api/v1/ApplicationListResource.scala | 30 - .../deploy/master/ui/MasterWebUISuite.scala | 118 +++ docs/monitoring.md | 5 - 11 files changed, 86 insertions(+), 316 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/cf2e9da6/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala index 4ffb528..53564d0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala @@ -41,7 +41,6 @@ private[spark] class ApplicationInfo( @transient var coresGranted: Int = _ @transient var endTime: Long = _ @transient var appSource: ApplicationSource = _ - @transient @volatile var appUIUrlAtHistoryServer: Option[String] = None // A cap on the number of executors this application can have at any given time. // By default, this is infinite. Only after the first allocation request is issued by the @@ -66,7 +65,6 @@ private[spark] class ApplicationInfo( nextExecutorId = 0 removedExecutors = new ArrayBuffer[ExecutorDesc] executorLimit = desc.initialExecutorLimit.getOrElse(Integer.MAX_VALUE) -appUIUrlAtHistoryServer = None } private def newExecutorId(useID: Option[Int] = None): Int = { @@ -136,11 +134,4 @@ private[spark] class ApplicationInfo( System.currentTimeMillis() - startTime } } - - /** - * Returns the original application UI url unless there is its address at history server - * is defined - */ - def curAppUIUrl: String = appUIUrlAtHistoryServer.getOrElse(desc.appUiUrl) - } http://git-wip-us.apache.org/repos/asf/spark/blob/cf2e9da6/core/src/main/scala/org/apache/spark/deploy/master/Master.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index edc9be2..faed4f4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -17,25 +17,17 @@ package org.apache.spark.deploy.master 
-import java.io.FileNotFoundException -import java.net.URLEncoder import java.text.SimpleDateFormat import java.util.Date -import java.util.concurrent.{ConcurrentHashMap, ScheduledFuture, TimeUnit} +import java.util.concurrent.{ScheduledFuture, TimeUnit} import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} -import scala.concurrent.{ExecutionContext, Future} -import scala.concurrent.duration.Duration -import
spark git commit: [SPARK-13001][CORE][MESOS] Prevent getting offers when reached max cores
Repository: spark Updated Branches: refs/heads/branch-2.0 23789e358 -> 1e7d9bfb5 [SPARK-13001][CORE][MESOS] Prevent getting offers when reached max cores Similar to https://github.com/apache/spark/pull/8639 This change rejects offers for 120s when `spark.cores.max` is reached in coarse-grained mode to mitigate offer starvation. This prevents Mesos from sending us offers again and again, starving other frameworks. This is especially problematic when running many small frameworks on the same Mesos cluster, e.g. many small Spark streaming jobs, causing the bigger Spark jobs to stop receiving offers. By rejecting the offers for a long period of time, they become available to those other frameworks. Author: Sebastien Rainville. Closes #10924 from sebastienrainville/master. (cherry picked from commit eb019af9a9cadb127eab1b6d30312169ed90f808) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1e7d9bfb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1e7d9bfb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1e7d9bfb Branch: refs/heads/branch-2.0 Commit: 1e7d9bfb5a41f5c2479ab3b4d4081f00bf00bd31 Parents: 23789e3 Author: Sebastien Rainville Authored: Wed May 4 14:32:36 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:32:47 2016 -0700 -- .../mesos/CoarseMesosSchedulerBackend.scala | 53 +--- .../cluster/mesos/MesosSchedulerUtils.scala | 4 ++ .../CoarseMesosSchedulerBackendSuite.scala | 13 + 3 files changed, 53 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1e7d9bfb/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 50b452c..2c5be1f 100644 --- 
a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -109,10 +109,14 @@ private[spark] class CoarseMesosSchedulerBackend( private val slaveOfferConstraints = parseConstraintString(sc.conf.get("spark.mesos.constraints", "")) - // reject offers with mismatched constraints in seconds + // Reject offers with mismatched constraints in seconds private val rejectOfferDurationForUnmetConstraints = getRejectOfferDurationForUnmetConstraints(sc) + // Reject offers when we reached the maximum number of cores for this framework + private val rejectOfferDurationForReachedMaxCores = +getRejectOfferDurationForReachedMaxCores(sc) + // A client for talking to the external shuffle service private val mesosExternalShuffleClient: Option[MesosExternalShuffleClient] = { if (shuffleServiceEnabled) { @@ -279,18 +283,32 @@ private[spark] class CoarseMesosSchedulerBackend( } private def declineUnmatchedOffers(d: SchedulerDriver, offers: Buffer[Offer]): Unit = { -for (offer <- offers) { - val id = offer.getId.getValue - val offerAttributes = toAttributeMap(offer.getAttributesList) - val mem = getResource(offer.getResourcesList, "mem") - val cpus = getResource(offer.getResourcesList, "cpus") - val filters = Filters.newBuilder() -.setRefuseSeconds(rejectOfferDurationForUnmetConstraints).build() - - logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus" -+ s" for $rejectOfferDurationForUnmetConstraints seconds") +offers.foreach { offer => + declineOffer(d, offer, Some("unmet constraints"), +Some(rejectOfferDurationForUnmetConstraints)) +} + } - d.declineOffer(offer.getId, filters) + private def declineOffer( + d: SchedulerDriver, + offer: Offer, + reason: Option[String] = None, + refuseSeconds: Option[Long] = None): Unit = { + +val id = offer.getId.getValue +val offerAttributes = 
toAttributeMap(offer.getAttributesList) +val mem = getResource(offer.getResourcesList, "mem") +val cpus = getResource(offer.getResourcesList, "cpus") + +logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem" + + s" cpu: $cpus for $refuseSeconds seconds" + + reason.map(r => s" (reason: $r)").getOrElse("")) + +refuseSeconds match { + case Some(seconds) => +val filters = Filters.newBuilder().setRefuseSeconds(seconds).build() +d.declineOffer(offer.getId, filters) + case _ =>
spark git commit: [SPARK-13001][CORE][MESOS] Prevent getting offers when reached max cores
Repository: spark Updated Branches: refs/heads/master cdce4e62a -> eb019af9a [SPARK-13001][CORE][MESOS] Prevent getting offers when reached max cores Similar to https://github.com/apache/spark/pull/8639 This change rejects offers for 120s when `spark.cores.max` is reached in coarse-grained mode to mitigate offer starvation. This prevents Mesos from sending us offers again and again, starving other frameworks. This is especially problematic when running many small frameworks on the same Mesos cluster, e.g. many small Spark streaming jobs, causing the bigger Spark jobs to stop receiving offers. By rejecting the offers for a long period of time, they become available to those other frameworks. Author: Sebastien Rainville. Closes #10924 from sebastienrainville/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eb019af9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eb019af9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eb019af9 Branch: refs/heads/master Commit: eb019af9a9cadb127eab1b6d30312169ed90f808 Parents: cdce4e6 Author: Sebastien Rainville Authored: Wed May 4 14:32:36 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:32:36 2016 -0700 -- .../mesos/CoarseMesosSchedulerBackend.scala | 53 +--- .../cluster/mesos/MesosSchedulerUtils.scala | 4 ++ .../CoarseMesosSchedulerBackendSuite.scala | 13 + 3 files changed, 53 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eb019af9/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 50b452c..2c5be1f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ 
b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -109,10 +109,14 @@ private[spark] class CoarseMesosSchedulerBackend( private val slaveOfferConstraints = parseConstraintString(sc.conf.get("spark.mesos.constraints", "")) - // reject offers with mismatched constraints in seconds + // Reject offers with mismatched constraints in seconds private val rejectOfferDurationForUnmetConstraints = getRejectOfferDurationForUnmetConstraints(sc) + // Reject offers when we reached the maximum number of cores for this framework + private val rejectOfferDurationForReachedMaxCores = +getRejectOfferDurationForReachedMaxCores(sc) + // A client for talking to the external shuffle service private val mesosExternalShuffleClient: Option[MesosExternalShuffleClient] = { if (shuffleServiceEnabled) { @@ -279,18 +283,32 @@ private[spark] class CoarseMesosSchedulerBackend( } private def declineUnmatchedOffers(d: SchedulerDriver, offers: Buffer[Offer]): Unit = { -for (offer <- offers) { - val id = offer.getId.getValue - val offerAttributes = toAttributeMap(offer.getAttributesList) - val mem = getResource(offer.getResourcesList, "mem") - val cpus = getResource(offer.getResourcesList, "cpus") - val filters = Filters.newBuilder() -.setRefuseSeconds(rejectOfferDurationForUnmetConstraints).build() - - logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus" -+ s" for $rejectOfferDurationForUnmetConstraints seconds") +offers.foreach { offer => + declineOffer(d, offer, Some("unmet constraints"), +Some(rejectOfferDurationForUnmetConstraints)) +} + } - d.declineOffer(offer.getId, filters) + private def declineOffer( + d: SchedulerDriver, + offer: Offer, + reason: Option[String] = None, + refuseSeconds: Option[Long] = None): Unit = { + +val id = offer.getId.getValue +val offerAttributes = toAttributeMap(offer.getAttributesList) +val mem = getResource(offer.getResourcesList, "mem") +val cpus = 
getResource(offer.getResourcesList, "cpus") + +logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem" + + s" cpu: $cpus for $refuseSeconds seconds" + + reason.map(r => s" (reason: $r)").getOrElse("")) + +refuseSeconds match { + case Some(seconds) => +val filters = Filters.newBuilder().setRefuseSeconds(seconds).build() +d.declineOffer(offer.getId, filters) + case _ => d.declineOffer(offer.getId) } } @@ -326,11 +344,12 @@ private[spark] class CoarseMesosSchedulerBackend(
spark git commit: [SPARK-15116] In REPL we should create SparkSession first and get SparkContext from it
Repository: spark Updated Branches: refs/heads/master eb019af9a -> a432a2b86 [SPARK-15116] In REPL we should create SparkSession first and get SparkContext from it ## What changes were proposed in this pull request? See https://github.com/apache/spark/pull/12873#discussion_r61993910. The problem is, if we create `SparkContext` first and then call `SparkSession.builder.enableHiveSupport().getOrCreate()`, we will reuse the existing `SparkContext` and the hive flag won't be set. ## How was this patch tested? Verified it locally. Author: Wenchen Fan. Closes #12890 from cloud-fan/repl. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a432a2b8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a432a2b8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a432a2b8 Branch: refs/heads/master Commit: a432a2b86081a18cebf4085cead702436960f6c7 Parents: eb019af Author: Wenchen Fan Authored: Wed May 4 14:40:54 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:40:54 2016 -0700 -- .../org/apache/spark/repl/SparkILoop.scala | 20 ++- .../org/apache/spark/repl/SparkILoopInit.scala | 11 +++- .../main/scala/org/apache/spark/repl/Main.scala | 27 +--- .../org/apache/spark/repl/SparkILoop.scala | 11 +++- 4 files changed, 26 insertions(+), 43 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a432a2b8/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala -- diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala index c4f6450..b1e95d8 100644 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -1003,7 +1003,7 @@ class SparkILoop( // NOTE: Must be public for visibility @DeveloperApi - def createSparkContext(): SparkContext = { + def createSparkSession(): SparkSession = { val execUri 
= System.getenv("SPARK_EXECUTOR_URI") val jars = SparkILoop.getAddedJars val conf = new SparkConf() @@ -1019,22 +1019,18 @@ class SparkILoop( if (execUri != null) { conf.set("spark.executor.uri", execUri) } -sparkContext = new SparkContext(conf) -logInfo("Created spark context..") -Signaling.cancelOnInterrupt(sparkContext) -sparkContext - } - @DeveloperApi - // TODO: don't duplicate this code - def createSparkSession(): SparkSession = { -if (SparkSession.hiveClassesArePresent) { +val builder = SparkSession.builder.config(conf) +val sparkSession = if (SparkSession.hiveClassesArePresent) { logInfo("Creating Spark session with Hive support") - SparkSession.builder.enableHiveSupport().getOrCreate() + builder.enableHiveSupport().getOrCreate() } else { logInfo("Creating Spark session") - SparkSession.builder.getOrCreate() + builder.getOrCreate() } +sparkContext = sparkSession.sparkContext +Signaling.cancelOnInterrupt(sparkContext) +sparkSession } private def getMaster(): String = { http://git-wip-us.apache.org/repos/asf/spark/blob/a432a2b8/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala -- diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala index f1febb9..29f63de 100644 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala +++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala @@ -123,19 +123,14 @@ private[repl] trait SparkILoopInit { def initializeSpark() { intp.beQuietDuring { command(""" +@transient val spark = org.apache.spark.repl.Main.interp.createSparkSession() @transient val sc = { - val _sc = org.apache.spark.repl.Main.interp.createSparkContext() + val _sc = spark.sparkContext _sc.uiWebUrl.foreach(webUrl => println(s"Spark context Web UI available at ${webUrl}")) println("Spark context available as 'sc' " + s"(master = ${_sc.master}, app id = ${_sc.applicationId}).") - _sc -} 
-""") - command(""" -@transient val spark = { - val _session = org.apache.spark.repl.Main.interp.createSparkSession() println("Spark session available as 'spark'.") - _session + _sc } """) command("import org.apache.spark.SparkContext._")
[1/4] spark git commit: [SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
Repository: spark Updated Branches: refs/heads/master cf2e9da61 -> cdce4e62a http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala index 77b913a..1b71a39 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.NGram // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NGramExample { def main(args: Array[String]): Unit = { -val conf = new SparkConf().setAppName("NGramExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) +val spark = SparkSession.builder.appName("NGramExample").getOrCreate() // $example on$ -val wordDataFrame = sqlContext.createDataFrame(Seq( +val wordDataFrame = spark.createDataFrame(Seq( (0, Array("Hi", "I", "heard", "about", "Spark")), (1, Array("I", "wish", "Java", "could", "use", "case", "classes")), (2, Array("Logistic", "regression", "models", "are", "neat")) @@ -41,7 +38,8 @@ object NGramExample { val ngramDataFrame = ngram.transform(wordDataFrame) ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println) // $example off$ -sc.stop() + +spark.stop() } } // scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index 5ea1270..8d54555 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -18,21 +18,18 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ -import org.apache.spark.ml.classification.{NaiveBayes} +import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NaiveBayesExample { def main(args: Array[String]): Unit = { -val conf = new SparkConf().setAppName("NaiveBayesExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) +val spark = SparkSession.builder.appName("NaiveBayesExample").getOrCreate() // $example on$ // Load the data stored in LIBSVM format as a DataFrame. -val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") +val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Split the data into training and test sets (30% held out for testing) val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) @@ -53,6 +50,8 @@ object NaiveBayesExample { val precision = evaluator.evaluate(predictions) println("Precision:" + precision) // $example off$ + +spark.stop() } } // scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala index 6b33c16..4622d69 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package 
org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.Normalizer // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NormalizerExample { def main(args: Array[String]): Unit = { -val conf = new SparkConf().setAppName("NormalizerExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) +val spark = SparkSession.builder.appName("NormalizerExample").getOrCreate() // $example on$ -val dataFrame =
[2/4] spark git commit: [SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/python/ml/naive_bayes_example.py -- diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py index db8fbea..e370355 100644 --- a/examples/src/main/python/ml/naive_bayes_example.py +++ b/examples/src/main/python/ml/naive_bayes_example.py @@ -17,21 +17,18 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.classification import NaiveBayes from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - -sc = SparkContext(appName="naive_bayes_example") -sqlContext = SQLContext(sc) +spark = SparkSession.builder.appName("naive_bayes_example").getOrCreate() # $example on$ # Load training data -data = sqlContext.read.format("libsvm") \ +data = spark.read.format("libsvm") \ .load("data/mllib/sample_libsvm_data.txt") # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) @@ -50,4 +47,4 @@ if __name__ == "__main__": print("Precision:" + str(evaluator.evaluate(predictionAndLabels))) # $example off$ -sc.stop() +spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/python/ml/normalizer_example.py -- diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py index d490221..ae25537 100644 --- a/examples/src/main/python/ml/normalizer_example.py +++ b/examples/src/main/python/ml/normalizer_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import Normalizer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": -sc = SparkContext(appName="NormalizerExample") -sqlContext = SQLContext(sc) +spark = 
SparkSession.builder.appName("NormalizerExample").getOrCreate() # $example on$ -dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") +dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Normalize each Vector using $L^1$ norm. normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) @@ -40,4 +38,4 @@ if __name__ == "__main__": lInfNormData.show() # $example off$ -sc.stop() +spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/python/ml/onehot_encoder_example.py -- diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py index 0f94c26..9acc363 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import OneHotEncoder, StringIndexer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": -sc = SparkContext(appName="OneHotEncoderExample") -sqlContext = SQLContext(sc) +spark = SparkSession.builder.appName("OneHotEncoderExample").getOrCreate() # $example on$ -df = sqlContext.createDataFrame([ +df = spark.createDataFrame([ (0, "a"), (1, "b"), (2, "c"), @@ -45,4 +43,4 @@ if __name__ == "__main__": encoded.select("id", "categoryVec").show() # $example off$ -sc.stop() +spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/python/ml/pca_example.py -- diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py index a17181f..adab151 100644 --- a/examples/src/main/python/ml/pca_example.py +++ b/examples/src/main/python/ml/pca_example.py @@ -17,26 +17,24 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import 
SQLContext # $example on$ from pyspark.ml.feature import PCA from pyspark.mllib.linalg import Vectors # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": -sc = SparkContext(appName="PCAExample") -sqlContext = SQLContext(sc) +spark = SparkSession.builder.appName("PCAExample").getOrCreate() # $example on$ data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
[3/4] spark git commit: [SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java index 48fc3c8..e328454 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java @@ -17,15 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.PolynomialExpansion; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; @@ -39,9 +36,7 @@ import org.apache.spark.sql.types.StructType; public class JavaPolynomialExpansionExample { public static void main(String[] args) { -SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext jsql = new SQLContext(jsc); +SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate(); // $example on$ PolynomialExpansion polyExpansion = new PolynomialExpansion() @@ -49,17 +44,17 @@ public class JavaPolynomialExpansionExample { .setOutputCol("polyFeatures") .setDegree(3); -JavaRDD data = jsc.parallelize(Arrays.asList( +List data = Arrays.asList( RowFactory.create(Vectors.dense(-2.0, 2.3)), RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(0.6, -1.1)) -)); +); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, 
Metadata.empty()), }); -Dataset df = jsql.createDataFrame(data, schema); +Dataset df = spark.createDataFrame(data, schema); Dataset polyDF = polyExpansion.transform(df); List rows = polyDF.select("polyFeatures").takeAsList(3); @@ -67,6 +62,6 @@ public class JavaPolynomialExpansionExample { System.out.println(r.get(0)); } // $example off$ -jsc.stop(); +spark.stop(); } } http://git-wip-us.apache.org/repos/asf/spark/blob/cdce4e62/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java index 7b226fe..94e3faf 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java @@ -17,13 +17,11 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.QuantileDiscretizer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -36,19 +34,16 @@ import org.apache.spark.sql.types.StructType; public class JavaQuantileDiscretizerExample { public static void main(String[] args) { -SparkConf conf = new SparkConf().setAppName("JavaQuantileDiscretizerExample"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext sqlContext = new SQLContext(jsc); +SparkSession spark = SparkSession + .builder().appName("JavaQuantileDiscretizerExample").getOrCreate(); // $example on$ -JavaRDD jrdd = jsc.parallelize( - Arrays.asList( -RowFactory.create(0, 18.0), -RowFactory.create(1, 19.0), -RowFactory.create(2, 8.0), 
-RowFactory.create(3, 5.0), -RowFactory.create(4, 2.2) - ) +List data = Arrays.asList( + RowFactory.create(0, 18.0), + RowFactory.create(1, 19.0), + RowFactory.create(2, 8.0), + RowFactory.create(3, 5.0), + RowFactory.create(4, 2.2) ); StructType schema = new StructType(new StructField[]{ @@ -56,7 +51,7 @@ public class JavaQuantileDiscretizerExample { new StructField("hour", DataTypes.DoubleType, false, Metadata.empty()) }); -Dataset df = sqlContext.createDataFrame(jrdd, schema); +Dataset df = spark.createDataFrame(data, schema);
[2/4] spark git commit: [SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/python/ml/naive_bayes_example.py -- diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py index db8fbea..e370355 100644 --- a/examples/src/main/python/ml/naive_bayes_example.py +++ b/examples/src/main/python/ml/naive_bayes_example.py @@ -17,21 +17,18 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.classification import NaiveBayes from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - -sc = SparkContext(appName="naive_bayes_example") -sqlContext = SQLContext(sc) +spark = SparkSession.builder.appName("naive_bayes_example").getOrCreate() # $example on$ # Load training data -data = sqlContext.read.format("libsvm") \ +data = spark.read.format("libsvm") \ .load("data/mllib/sample_libsvm_data.txt") # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) @@ -50,4 +47,4 @@ if __name__ == "__main__": print("Precision:" + str(evaluator.evaluate(predictionAndLabels))) # $example off$ -sc.stop() +spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/python/ml/normalizer_example.py -- diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py index d490221..ae25537 100644 --- a/examples/src/main/python/ml/normalizer_example.py +++ b/examples/src/main/python/ml/normalizer_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import Normalizer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": -sc = SparkContext(appName="NormalizerExample") -sqlContext = SQLContext(sc) +spark = 
SparkSession.builder.appName("NormalizerExample").getOrCreate() # $example on$ -dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") +dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Normalize each Vector using $L^1$ norm. normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) @@ -40,4 +38,4 @@ if __name__ == "__main__": lInfNormData.show() # $example off$ -sc.stop() +spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/python/ml/onehot_encoder_example.py -- diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py index 0f94c26..9acc363 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import OneHotEncoder, StringIndexer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": -sc = SparkContext(appName="OneHotEncoderExample") -sqlContext = SQLContext(sc) +spark = SparkSession.builder.appName("OneHotEncoderExample").getOrCreate() # $example on$ -df = sqlContext.createDataFrame([ +df = spark.createDataFrame([ (0, "a"), (1, "b"), (2, "c"), @@ -45,4 +43,4 @@ if __name__ == "__main__": encoded.select("id", "categoryVec").show() # $example off$ -sc.stop() +spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/python/ml/pca_example.py -- diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py index a17181f..adab151 100644 --- a/examples/src/main/python/ml/pca_example.py +++ b/examples/src/main/python/ml/pca_example.py @@ -17,26 +17,24 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import 
SQLContext # $example on$ from pyspark.ml.feature import PCA from pyspark.mllib.linalg import Vectors # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": -sc = SparkContext(appName="PCAExample") -sqlContext = SQLContext(sc) +spark = SparkSession.builder.appName("PCAExample").getOrCreate() # $example on$ data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
[4/4] spark git commit: [SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
[SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example. ## What changes were proposed in this pull request? This PR aims to update Scala/Python/Java examples by replacing `SQLContext` with newly added `SparkSession`. - Use **SparkSession Builder Pattern** in 154(Scala 55, Java 52, Python 47) files. - Add `getConf` in Python SparkContext class: `python/pyspark/context.py` - Replace **SQLContext Singleton Pattern** with **SparkSession Singleton Pattern**: - `SqlNetworkWordCount.scala` - `JavaSqlNetworkWordCount.java` - `sql_network_wordcount.py` Now, `SQLContexts` are used only in R examples and the following two Python examples. The python examples are untouched in this PR since it already fails some unknown issue. - `simple_params_example.py` - `aft_survival_regression.py` ## How was this patch tested? Manual. Author: Dongjoon HyunCloses #12809 from dongjoon-hyun/SPARK-15031. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cdce4e62 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cdce4e62 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cdce4e62 Branch: refs/heads/master Commit: cdce4e62a5674e2034e5d395578b1a60e3d8c435 Parents: cf2e9da Author: Dongjoon Hyun Authored: Wed May 4 14:31:36 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:31:36 2016 -0700 -- .../ml/JavaAFTSurvivalRegressionExample.java| 12 ++--- .../spark/examples/ml/JavaALSExample.java | 15 +++--- .../spark/examples/ml/JavaBinarizerExample.java | 15 +++--- .../examples/ml/JavaBisectingKMeansExample.java | 18 +++ .../examples/ml/JavaBucketizerExample.java | 18 +++ .../examples/ml/JavaChiSqSelectorExample.java | 15 +++--- .../examples/ml/JavaCountVectorizerExample.java | 19 +++- .../spark/examples/ml/JavaDCTExample.java | 15 +++--- .../JavaDecisionTreeClassificationExample.java | 13 ++ .../ml/JavaDecisionTreeRegressionExample.java | 13 ++ .../examples/ml/JavaDeveloperApiExample.java| 15 ++ 
.../ml/JavaElementwiseProductExample.java | 15 +++--- .../JavaEstimatorTransformerParamExample.java | 16 +++ ...avaGradientBoostedTreeClassifierExample.java | 11 ++--- ...JavaGradientBoostedTreeRegressorExample.java | 14 ++ .../examples/ml/JavaIndexToStringExample.java | 18 +++ .../spark/examples/ml/JavaKMeansExample.java| 14 ++ .../spark/examples/ml/JavaLDAExample.java | 14 ++ ...vaLinearRegressionWithElasticNetExample.java | 13 ++ .../JavaLogisticRegressionSummaryExample.java | 13 ++ ...LogisticRegressionWithElasticNetExample.java | 13 ++ .../examples/ml/JavaMaxAbsScalerExample.java| 12 ++--- .../examples/ml/JavaMinMaxScalerExample.java| 12 ++--- ...ModelSelectionViaCrossValidationExample.java | 16 +++ ...SelectionViaTrainValidationSplitExample.java | 14 ++ ...vaMultilayerPerceptronClassifierExample.java | 13 ++ .../spark/examples/ml/JavaNGramExample.java | 18 +++ .../examples/ml/JavaNaiveBayesExample.java | 14 ++ .../examples/ml/JavaNormalizerExample.java | 13 ++ .../examples/ml/JavaOneHotEncoderExample.java | 18 +++ .../spark/examples/ml/JavaOneVsRestExample.java | 14 ++ .../spark/examples/ml/JavaPCAExample.java | 18 +++ .../spark/examples/ml/JavaPipelineExample.java | 16 ++- .../ml/JavaPolynomialExpansionExample.java | 17 +++ .../ml/JavaQuantileDiscretizerExample.java | 29 +--- .../spark/examples/ml/JavaRFormulaExample.java | 18 +++ .../ml/JavaRandomForestClassifierExample.java | 14 ++ .../ml/JavaRandomForestRegressorExample.java| 14 ++ .../examples/ml/JavaSQLTransformerExample.java | 19 +++- .../examples/ml/JavaSimpleParamsExample.java| 14 ++ .../JavaSimpleTextClassificationPipeline.java | 15 +++--- .../examples/ml/JavaStandardScalerExample.java | 13 ++ .../ml/JavaStopWordsRemoverExample.java | 18 +++ .../examples/ml/JavaStringIndexerExample.java | 18 +++ .../spark/examples/ml/JavaTfIdfExample.java | 18 +++ .../spark/examples/ml/JavaTokenizerExample.java | 18 +++ .../examples/ml/JavaVectorAssemblerExample.java | 14 ++ 
.../examples/ml/JavaVectorIndexerExample.java | 12 ++--- .../examples/ml/JavaVectorSlicerExample.java| 19 .../spark/examples/ml/JavaWord2VecExample.java | 19 +++- .../apache/spark/examples/sql/JavaSparkSQL.java | 33 ++--- .../streaming/JavaSqlNetworkWordCount.java | 19 examples/src/main/python/ml/als_example.py | 14 +++--- .../src/main/python/ml/binarizer_example.py | 10 ++-- .../main/python/ml/bisecting_k_means_example.py | 16 +++ .../src/main/python/ml/bucketizer_example.py| 10 ++--
[1/4] spark git commit: [SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
Repository: spark Updated Branches: refs/heads/branch-2.0 c0715f33b -> 23789e358 http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala index 77b913a..1b71a39 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.NGram // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NGramExample { def main(args: Array[String]): Unit = { -val conf = new SparkConf().setAppName("NGramExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) +val spark = SparkSession.builder.appName("NGramExample").getOrCreate() // $example on$ -val wordDataFrame = sqlContext.createDataFrame(Seq( +val wordDataFrame = spark.createDataFrame(Seq( (0, Array("Hi", "I", "heard", "about", "Spark")), (1, Array("I", "wish", "Java", "could", "use", "case", "classes")), (2, Array("Logistic", "regression", "models", "are", "neat")) @@ -41,7 +38,8 @@ object NGramExample { val ngramDataFrame = ngram.transform(wordDataFrame) ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println) // $example off$ -sc.stop() + +spark.stop() } } // scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index 5ea1270..8d54555 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -18,21 +18,18 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ -import org.apache.spark.ml.classification.{NaiveBayes} +import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NaiveBayesExample { def main(args: Array[String]): Unit = { -val conf = new SparkConf().setAppName("NaiveBayesExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) +val spark = SparkSession.builder.appName("NaiveBayesExample").getOrCreate() // $example on$ // Load the data stored in LIBSVM format as a DataFrame. -val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") +val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Split the data into training and test sets (30% held out for testing) val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) @@ -53,6 +50,8 @@ object NaiveBayesExample { val precision = evaluator.evaluate(predictions) println("Precision:" + precision) // $example off$ + +spark.stop() } } // scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala index 6b33c16..4622d69 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package 
org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.Normalizer // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NormalizerExample { def main(args: Array[String]): Unit = { -val conf = new SparkConf().setAppName("NormalizerExample") -val sc = new SparkContext(conf) -val sqlContext = new SQLContext(sc) +val spark = SparkSession.builder.appName("NormalizerExample").getOrCreate() // $example on$ -val dataFrame =
[4/4] spark git commit: [SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
[SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example. ## What changes were proposed in this pull request? This PR aims to update Scala/Python/Java examples by replacing `SQLContext` with newly added `SparkSession`. - Use **SparkSession Builder Pattern** in 154(Scala 55, Java 52, Python 47) files. - Add `getConf` in Python SparkContext class: `python/pyspark/context.py` - Replace **SQLContext Singleton Pattern** with **SparkSession Singleton Pattern**: - `SqlNetworkWordCount.scala` - `JavaSqlNetworkWordCount.java` - `sql_network_wordcount.py` Now, `SQLContexts` are used only in R examples and the following two Python examples. The python examples are untouched in this PR since it already fails some unknown issue. - `simple_params_example.py` - `aft_survival_regression.py` ## How was this patch tested? Manual. Author: Dongjoon HyunCloses #12809 from dongjoon-hyun/SPARK-15031. (cherry picked from commit cdce4e62a5674e2034e5d395578b1a60e3d8c435) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/23789e35 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/23789e35 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/23789e35 Branch: refs/heads/branch-2.0 Commit: 23789e358589505121113d504adee97e56c00929 Parents: c0715f3 Author: Dongjoon Hyun Authored: Wed May 4 14:31:36 2016 -0700 Committer: Andrew Or Committed: Wed May 4 14:31:47 2016 -0700 -- .../ml/JavaAFTSurvivalRegressionExample.java| 12 ++--- .../spark/examples/ml/JavaALSExample.java | 15 +++--- .../spark/examples/ml/JavaBinarizerExample.java | 15 +++--- .../examples/ml/JavaBisectingKMeansExample.java | 18 +++ .../examples/ml/JavaBucketizerExample.java | 18 +++ .../examples/ml/JavaChiSqSelectorExample.java | 15 +++--- .../examples/ml/JavaCountVectorizerExample.java | 19 +++- .../spark/examples/ml/JavaDCTExample.java | 15 +++--- .../JavaDecisionTreeClassificationExample.java | 13 ++ 
.../ml/JavaDecisionTreeRegressionExample.java | 13 ++ .../examples/ml/JavaDeveloperApiExample.java| 15 ++ .../ml/JavaElementwiseProductExample.java | 15 +++--- .../JavaEstimatorTransformerParamExample.java | 16 +++ ...avaGradientBoostedTreeClassifierExample.java | 11 ++--- ...JavaGradientBoostedTreeRegressorExample.java | 14 ++ .../examples/ml/JavaIndexToStringExample.java | 18 +++ .../spark/examples/ml/JavaKMeansExample.java| 14 ++ .../spark/examples/ml/JavaLDAExample.java | 14 ++ ...vaLinearRegressionWithElasticNetExample.java | 13 ++ .../JavaLogisticRegressionSummaryExample.java | 13 ++ ...LogisticRegressionWithElasticNetExample.java | 13 ++ .../examples/ml/JavaMaxAbsScalerExample.java| 12 ++--- .../examples/ml/JavaMinMaxScalerExample.java| 12 ++--- ...ModelSelectionViaCrossValidationExample.java | 16 +++ ...SelectionViaTrainValidationSplitExample.java | 14 ++ ...vaMultilayerPerceptronClassifierExample.java | 13 ++ .../spark/examples/ml/JavaNGramExample.java | 18 +++ .../examples/ml/JavaNaiveBayesExample.java | 14 ++ .../examples/ml/JavaNormalizerExample.java | 13 ++ .../examples/ml/JavaOneHotEncoderExample.java | 18 +++ .../spark/examples/ml/JavaOneVsRestExample.java | 14 ++ .../spark/examples/ml/JavaPCAExample.java | 18 +++ .../spark/examples/ml/JavaPipelineExample.java | 16 ++- .../ml/JavaPolynomialExpansionExample.java | 17 +++ .../ml/JavaQuantileDiscretizerExample.java | 29 +--- .../spark/examples/ml/JavaRFormulaExample.java | 18 +++ .../ml/JavaRandomForestClassifierExample.java | 14 ++ .../ml/JavaRandomForestRegressorExample.java| 14 ++ .../examples/ml/JavaSQLTransformerExample.java | 19 +++- .../examples/ml/JavaSimpleParamsExample.java| 14 ++ .../JavaSimpleTextClassificationPipeline.java | 15 +++--- .../examples/ml/JavaStandardScalerExample.java | 13 ++ .../ml/JavaStopWordsRemoverExample.java | 18 +++ .../examples/ml/JavaStringIndexerExample.java | 18 +++ .../spark/examples/ml/JavaTfIdfExample.java | 18 +++ .../spark/examples/ml/JavaTokenizerExample.java 
| 18 +++ .../examples/ml/JavaVectorAssemblerExample.java | 14 ++ .../examples/ml/JavaVectorIndexerExample.java | 12 ++--- .../examples/ml/JavaVectorSlicerExample.java| 19 .../spark/examples/ml/JavaWord2VecExample.java | 19 +++- .../apache/spark/examples/sql/JavaSparkSQL.java | 33 ++--- .../streaming/JavaSqlNetworkWordCount.java | 19 examples/src/main/python/ml/als_example.py | 14 +++--- .../src/main/python/ml/binarizer_example.py | 10 ++--
[3/4] spark git commit: [SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java index 48fc3c8..e328454 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java @@ -17,15 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.PolynomialExpansion; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; @@ -39,9 +36,7 @@ import org.apache.spark.sql.types.StructType; public class JavaPolynomialExpansionExample { public static void main(String[] args) { -SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext jsql = new SQLContext(jsc); +SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate(); // $example on$ PolynomialExpansion polyExpansion = new PolynomialExpansion() @@ -49,17 +44,17 @@ public class JavaPolynomialExpansionExample { .setOutputCol("polyFeatures") .setDegree(3); -JavaRDD data = jsc.parallelize(Arrays.asList( +List data = Arrays.asList( RowFactory.create(Vectors.dense(-2.0, 2.3)), RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(0.6, -1.1)) -)); +); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, 
Metadata.empty()), }); -Dataset df = jsql.createDataFrame(data, schema); +Dataset df = spark.createDataFrame(data, schema); Dataset polyDF = polyExpansion.transform(df); List rows = polyDF.select("polyFeatures").takeAsList(3); @@ -67,6 +62,6 @@ public class JavaPolynomialExpansionExample { System.out.println(r.get(0)); } // $example off$ -jsc.stop(); +spark.stop(); } } http://git-wip-us.apache.org/repos/asf/spark/blob/23789e35/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java index 7b226fe..94e3faf 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java @@ -17,13 +17,11 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.QuantileDiscretizer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -36,19 +34,16 @@ import org.apache.spark.sql.types.StructType; public class JavaQuantileDiscretizerExample { public static void main(String[] args) { -SparkConf conf = new SparkConf().setAppName("JavaQuantileDiscretizerExample"); -JavaSparkContext jsc = new JavaSparkContext(conf); -SQLContext sqlContext = new SQLContext(jsc); +SparkSession spark = SparkSession + .builder().appName("JavaQuantileDiscretizerExample").getOrCreate(); // $example on$ -JavaRDD jrdd = jsc.parallelize( - Arrays.asList( -RowFactory.create(0, 18.0), -RowFactory.create(1, 19.0), -RowFactory.create(2, 8.0), 
-RowFactory.create(3, 5.0), -RowFactory.create(4, 2.2) - ) +List data = Arrays.asList( + RowFactory.create(0, 18.0), + RowFactory.create(1, 19.0), + RowFactory.create(2, 8.0), + RowFactory.create(3, 5.0), + RowFactory.create(4, 2.2) ); StructType schema = new StructType(new StructField[]{ @@ -56,7 +51,7 @@ public class JavaQuantileDiscretizerExample { new StructField("hour", DataTypes.DoubleType, false, Metadata.empty()) }); -Dataset df = sqlContext.createDataFrame(jrdd, schema); +Dataset df = spark.createDataFrame(data, schema);
spark git commit: [SPARK-14592][SQL] Native support for CREATE TABLE LIKE DDL command
Repository: spark Updated Branches: refs/heads/master c971aee40 -> 28efdd3fd [SPARK-14592][SQL] Native support for CREATE TABLE LIKE DDL command ## What changes were proposed in this pull request? JIRA: https://issues.apache.org/jira/browse/SPARK-14592 This patch adds native support for DDL command `CREATE TABLE LIKE`. The SQL syntax is like: CREATE TABLE table_name LIKE existing_table CREATE TABLE IF NOT EXISTS table_name LIKE existing_table ## How was this patch tested? `HiveDDLCommandSuite`. `HiveQuerySuite` already tests `CREATE TABLE LIKE`. Author: Liang-Chi HsiehThis patch had conflicts when merged, resolved by Committer: Andrew Or Closes #12362 from viirya/create-table-like. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28efdd3f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28efdd3f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28efdd3f Branch: refs/heads/master Commit: 28efdd3fd789fa2ebed5be03b36ca0f682e37669 Parents: c971aee Author: Liang-Chi Hsieh Authored: Thu Apr 14 11:08:08 2016 -0700 Committer: Andrew Or Committed: Thu Apr 14 11:08:08 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 7 ++-- .../spark/sql/execution/command/tables.scala| 40 +++- .../hive/execution/HiveCompatibilitySuite.scala | 4 +- .../sql/hive/execution/HiveSqlParser.scala | 13 ++- .../spark/sql/hive/HiveDDLCommandSuite.scala| 24 +++- 5 files changed, 79 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/28efdd3f/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index a937ad1..9cf2dd2 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ 
b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -55,6 +55,8 @@ statement rowFormat? createFileFormat? locationSpec? (TBLPROPERTIES tablePropertyList)? (AS? query)? #createTable +| CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier +LIKE source=tableIdentifier #createTableLike | ANALYZE TABLE tableIdentifier partitionSpec? COMPUTE STATISTICS (identifier | FOR COLUMNS identifierSeq?)? #analyze | ALTER (TABLE | VIEW) from=tableIdentifier @@ -136,10 +138,7 @@ statement ; hiveNativeCommands -: createTableHeader LIKE tableIdentifier -rowFormat? createFileFormat? locationSpec? -(TBLPROPERTIES tablePropertyList)? -| DELETE FROM tableIdentifier (WHERE booleanExpression)? +: DELETE FROM tableIdentifier (WHERE booleanExpression)? | TRUNCATE TABLE tableIdentifier partitionSpec? (COLUMNS identifierList)? | SHOW COLUMNS (FROM | IN) tableIdentifier ((FROM|IN) identifier)? http://git-wip-us.apache.org/repos/asf/spark/blob/28efdd3f/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index e315598..0b41985 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -17,9 +17,45 @@ package org.apache.spark.sql.execution.command -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.{AnalysisException, Row, SQLContext} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} + +/** + * A command to create a table with the same definition of the given existing table. 
+ * + * The syntax of using this command in SQL is: + * {{{ + * CREATE TABLE [IF NOT EXISTS] [db_name.]table_name + * LIKE [other_db_name.]existing_table_name + * }}} + */ +case class CreateTableLike( +targetTable: TableIdentifier, +sourceTable: TableIdentifier, +ifNotExists: Boolean) extends RunnableCommand { + + override def run(sqlContext: SQLContext): Seq[Row] = { +val catalog = sqlContext.sessionState.catalog +if (!catalog.tableExists(sourceTable)) { + throw new AnalysisException( +
spark git commit: Revert "[SPARK-14647][SQL] Group SQLContext/HiveContext state into SharedState"
Repository: spark Updated Branches: refs/heads/master 699a4dfd8 -> 7de06a646 Revert "[SPARK-14647][SQL] Group SQLContext/HiveContext state into SharedState" This reverts commit 5cefecc95a5b8418713516802c416cfde5a94a2d. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7de06a64 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7de06a64 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7de06a64 Branch: refs/heads/master Commit: 7de06a646dff7ede520d2e982ac0996d8c184650 Parents: 699a4df Author: Andrew OrAuthored: Sun Apr 17 17:35:41 2016 -0700 Committer: Andrew Or Committed: Sun Apr 17 17:35:41 2016 -0700 -- .../scala/org/apache/spark/sql/SQLContext.scala | 31 +++ .../spark/sql/internal/SessionState.scala | 2 + .../apache/spark/sql/internal/SharedState.scala | 47 --- .../org/apache/spark/sql/hive/HiveContext.scala | 51 .../spark/sql/hive/HiveSessionState.scala | 15 +--- .../apache/spark/sql/hive/HiveSharedState.scala | 53 .../apache/spark/sql/hive/test/TestHive.scala | 86 +--- .../sql/hive/HiveExternalCatalogSuite.scala | 12 ++- 8 files changed, 122 insertions(+), 175 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7de06a64/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 781d699..9259ff4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -41,7 +41,7 @@ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.command.ShowTablesCommand import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.ui.{SQLListener, SQLTab} -import org.apache.spark.sql.internal.{SessionState, SharedState, SQLConf} +import org.apache.spark.sql.internal.{SessionState, SQLConf} import 
org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.apache.spark.sql.util.ExecutionListenerManager @@ -63,14 +63,17 @@ import org.apache.spark.util.Utils * @since 1.0.0 */ class SQLContext private[sql]( -@transient protected[sql] val sharedState: SharedState, -val isRootContext: Boolean) +@transient val sparkContext: SparkContext, +@transient protected[sql] val cacheManager: CacheManager, +@transient private[sql] val listener: SQLListener, +val isRootContext: Boolean, +@transient private[sql] val externalCatalog: ExternalCatalog) extends Logging with Serializable { self => def this(sc: SparkContext) = { -this(new SharedState(sc), true) +this(sc, new CacheManager, SQLContext.createListenerAndUI(sc), true, new InMemoryCatalog) } def this(sparkContext: JavaSparkContext) = this(sparkContext.sc) @@ -97,20 +100,20 @@ class SQLContext private[sql]( } } - def sparkContext: SparkContext = sharedState.sparkContext - - protected[sql] def cacheManager: CacheManager = sharedState.cacheManager - protected[sql] def listener: SQLListener = sharedState.listener - protected[sql] def externalCatalog: ExternalCatalog = sharedState.externalCatalog - /** - * Returns a [[SQLContext]] as new session, with separated SQL configurations, temporary - * tables, registered functions, but sharing the same [[SparkContext]], cached data and - * other things. + * Returns a SQLContext as new session, with separated SQL configurations, temporary tables, + * registered functions, but sharing the same SparkContext, CacheManager, SQLListener and SQLTab. * * @since 1.6.0 */ - def newSession(): SQLContext = new SQLContext(sharedState, isRootContext = false) + def newSession(): SQLContext = { +new SQLContext( + sparkContext = sparkContext, + cacheManager = cacheManager, + listener = listener, + isRootContext = false, + externalCatalog = externalCatalog) + } /** * Per-session state, e.g. configuration, functions, temporary tables etc. 
http://git-wip-us.apache.org/repos/asf/spark/blob/7de06a64/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index d404a7c..c30f879 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++
spark git commit: [SPARK-14619] Track internal accumulators (metrics) by stage attempt
Repository: spark Updated Branches: refs/heads/master 9fa43a33b -> dac40b68d [SPARK-14619] Track internal accumulators (metrics) by stage attempt ## What changes were proposed in this pull request? When there are multiple attempts for a stage, we currently only reset internal accumulator values if all the tasks are resubmitted. It would make more sense to reset the accumulator values for each stage attempt. This will allow us to eventually get rid of the internal flag in the Accumulator class. This is part of my bigger effort to simplify accumulators and task metrics. ## How was this patch tested? Covered by existing tests. Author: Reynold XinCloses #12378 from rxin/SPARK-14619. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dac40b68 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dac40b68 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dac40b68 Branch: refs/heads/master Commit: dac40b68dc52d5ab855dfde63f0872064aa3d999 Parents: 9fa43a3 Author: Reynold Xin Authored: Thu Apr 14 10:54:57 2016 -0700 Committer: Andrew Or Committed: Thu Apr 14 10:54:57 2016 -0700 -- .../org/apache/spark/InternalAccumulator.scala | 2 +- .../apache/spark/scheduler/DAGScheduler.scala| 11 ++- .../scala/org/apache/spark/scheduler/Stage.scala | 19 ++- .../org/apache/spark/scheduler/StageInfo.scala | 10 +- .../scala/org/apache/spark/ui/jobs/JobPage.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala | 6 -- .../spark/ExecutorAllocationManagerSuite.scala | 4 ++-- .../scala/org/apache/spark/ShuffleSuite.scala| 6 +++--- .../spark/scheduler/DAGSchedulerSuite.scala | 2 +- .../sql/execution/UnsafeRowSerializerSuite.scala | 2 +- 10 files changed, 26 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dac40b68/core/src/main/scala/org/apache/spark/InternalAccumulator.scala -- diff --git a/core/src/main/scala/org/apache/spark/InternalAccumulator.scala 
b/core/src/main/scala/org/apache/spark/InternalAccumulator.scala index 7aa9057..0dd4ec6 100644 --- a/core/src/main/scala/org/apache/spark/InternalAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/InternalAccumulator.scala @@ -187,7 +187,7 @@ private[spark] object InternalAccumulator { * add to the same set of accumulators. We do this to report the distribution of accumulator * values across all tasks within each stage. */ - def create(sc: SparkContext): Seq[Accumulator[_]] = { + def createAll(sc: SparkContext): Seq[Accumulator[_]] = { val accums = createAll() accums.foreach { accum => Accumulators.register(accum) http://git-wip-us.apache.org/repos/asf/spark/blob/dac40b68/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 4609b24..c27aad2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -950,13 +950,6 @@ class DAGScheduler( // First figure out the indexes of partition ids to compute. val partitionsToCompute: Seq[Int] = stage.findMissingPartitions() -// Create internal accumulators if the stage has no accumulators initialized. -// Reset internal accumulators only if this stage is not partially submitted -// Otherwise, we may override existing accumulator values from some tasks -if (stage.internalAccumulators.isEmpty || stage.numPartitions == partitionsToCompute.size) { - stage.resetInternalAccumulators() -} - // Use the scheduling pool, job group, description, etc. 
from an ActiveJob associated // with this Stage val properties = jobIdToActiveJob(jobId).properties @@ -1036,7 +1029,7 @@ class DAGScheduler( val locs = taskIdToLocations(id) val part = stage.rdd.partitions(id) new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, - taskBinary, part, locs, stage.internalAccumulators, properties) + taskBinary, part, locs, stage.latestInfo.internalAccumulators, properties) } case stage: ResultStage => @@ -1046,7 +1039,7 @@ class DAGScheduler( val part = stage.rdd.partitions(p) val locs = taskIdToLocations(id) new ResultTask(stage.id, stage.latestInfo.attemptId, - taskBinary, part, locs, id, properties, stage.internalAccumulators) +