Github user wangmiao1981 commented on a diff in the pull request:
https://github.com/apache/spark/pull/12402#discussion_r60694336
--- Diff:
mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala ---
@@ -104,6 +105,17 @@ class GaussianMixtureModel private[ml] (
@Since("2.0.0")
def gaussians: Array[MultivariateGaussian] = parentModel.gaussians
+ /**
+ * Helper method used in Python
+ */
+ def gaussiansDF: DataFrame = {
--- End diff --
@jkbradley
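After changing to the lines suggested above, I got the errors below. It seems
that I can't use the SparkContext to get the sqlContext here: per the
serialization stack at the end of the trace, the model now holds the
SparkContext in a field (`sc`), and the UDF closure captures the model, so the
task is not serializable.
Errors: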
execute, tree:
Exchange hashpartitioning(gm_prediction#3, 200), None
+- WholeStageCodegen
: +- TungstenAggregate(key=[gm_prediction#3],
functions=[(count(1),mode=Partial,isDistinct=false)],
output=[gm_prediction#3,count#32L])
: +- Project [UDF(features#0) AS gm_prediction#3]
: +- INPUT
+- Scan ExistingRDD[features#0]
org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute,
tree:
Exchange hashpartitioning(gm_prediction#3, 200), None
+- WholeStageCodegen
: +- TungstenAggregate(key=[gm_prediction#3],
functions=[(count(1),mode=Partial,isDistinct=false)],
output=[gm_prediction#3,count#32L])
: +- Project [UDF(features#0) AS gm_prediction#3]
: +- INPUT
+- Scan ExistingRDD[features#0]
at
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:50)
at
org.apache.spark.sql.execution.exchange.ShuffleExchange.doExecute(ShuffleExchange.scala:109)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117)
at
org.apache.spark.sql.execution.InputAdapter.upstreams(WholeStageCodegen.scala:237)
at
org.apache.spark.sql.execution.aggregate.TungstenAggregate.upstreams(TungstenAggregate.scala:131)
at
org.apache.spark.sql.execution.WholeStageCodegen.doExecute(WholeStageCodegen.scala:352)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117)
at
org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:230)
at
org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:277)
at
org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2099)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:53)
at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2386)
at
org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2098)
at
org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2103)
at
org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2103)
at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2399)
at
org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2103)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2079)
at
org.apache.spark.ml.clustering.GaussianMixtureSummary.clusterSizes$lzycompute(GaussianMixture.scala:327)
at
org.apache.spark.ml.clustering.GaussianMixtureSummary.clusterSizes(GaussianMixture.scala:325)
at
org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply$mcV$sp(GaussianMixtureSuite.scala:102)
at
org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply(GaussianMixtureSuite.scala:73)
at
org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply(GaussianMixtureSuite.scala:73)
at
org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22)
at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85)
at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
at org.scalatest.Transformer.apply(Transformer.scala:22)
at org.scalatest.Transformer.apply(Transformer.scala:20)
at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:166)
at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:56)
at
org.scalatest.FunSuiteLike$class.invokeWithFixture$1(FunSuiteLike.scala:163)
at
org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175)
at
org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175)
at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
at org.scalatest.FunSuiteLike$class.runTest(FunSuiteLike.scala:175)
at org.scalatest.FunSuite.runTest(FunSuite.scala:1555)
at
org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208)
at
org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208)
at
org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:413)
at
org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:401)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
at
org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:396)
at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:483)
at org.scalatest.FunSuiteLike$class.runTests(FunSuiteLike.scala:208)
at org.scalatest.FunSuite.runTests(FunSuite.scala:1555)
at org.scalatest.Suite$class.run(Suite.scala:1424)
at
org.scalatest.FunSuite.org$scalatest$FunSuiteLike$$super$run(FunSuite.scala:1555)
at
org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212)
at
org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212)
at org.scalatest.SuperEngine.runImpl(Engine.scala:545)
at org.scalatest.FunSuiteLike$class.run(FunSuiteLike.scala:212)
at
org.apache.spark.SparkFunSuite.org$scalatest$BeforeAndAfterAll$$super$run(SparkFunSuite.scala:28)
at
org.scalatest.BeforeAndAfterAll$class.liftedTree1$1(BeforeAndAfterAll.scala:257)
at
org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:256)
at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:28)
at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:55)
at
org.scalatest.tools.Runner$$anonfun$doRunRunRunDaDoRunRun$3.apply(Runner.scala:2563)
at
org.scalatest.tools.Runner$$anonfun$doRunRunRunDaDoRunRun$3.apply(Runner.scala:2557)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:2557)
at
org.scalatest.tools.Runner$$anonfun$runOptionallyWithPassFailReporter$2.apply(Runner.scala:1044)
at
org.scalatest.tools.Runner$$anonfun$runOptionallyWithPassFailReporter$2.apply(Runner.scala:1043)
at
org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:2722)
at
org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:1043)
at org.scalatest.tools.Runner$.run(Runner.scala:883)
at org.scalatest.tools.Runner.run(Runner.scala)
at
org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.runScalaTest2(ScalaTestRunner.java:138)
at
org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.main(ScalaTestRunner.java:28)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
Caused by: org.apache.spark.SparkException: Task not serializable
at
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:305)
at
org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:295)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:123)
at org.apache.spark.SparkContext.clean(SparkContext.scala:1944)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:782)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:781)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:357)
at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:781)
at
org.apache.spark.sql.execution.WholeStageCodegen.doExecute(WholeStageCodegen.scala:355)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117)
at
org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:82)
at
org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:118)
at
org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:109)
at
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
... 83 more
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
- object not serializable (class: org.apache.spark.SparkContext, value:
org.apache.spark.SparkContext@6eaadd1d)
- field (class: org.apache.spark.ml.clustering.GaussianMixtureModel,
name: sc, type: class org.apache.spark.SparkContext)
- object (class org.apache.spark.ml.clustering.GaussianMixtureModel,
GaussianMixture_0e35e80dd8f8)
- field (class:
org.apache.spark.ml.clustering.GaussianMixtureModel$$anonfun$3, name: $outer,
type: class org.apache.spark.ml.clustering.GaussianMixtureModel)
- object (class
org.apache.spark.ml.clustering.GaussianMixtureModel$$anonfun$3, <function1>)
- field (class:
org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, name: func$2,
type: interface scala.Function1)
- object (class
org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, <function1>)
- field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF,
name: f, type: interface scala.Function1)
- object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF,
UDF(features#0))
- field (class: org.apache.spark.sql.catalyst.expressions.Alias, name:
child, type: class org.apache.spark.sql.catalyst.expressions.Expression)
- object (class org.apache.spark.sql.catalyst.expressions.Alias,
UDF(features#0) AS gm_prediction#3)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 1)
- field (class: scala.collection.mutable.ArrayBuffer, name: array,
type: class [Ljava.lang.Object;)
- object (class scala.collection.mutable.ArrayBuffer,
ArrayBuffer(UDF(features#0) AS gm_prediction#3))
- field (class: org.apache.spark.sql.execution.Project, name:
projectList, type: interface scala.collection.Seq)
- object (class org.apache.spark.sql.execution.Project, Project
[UDF(features#0) AS gm_prediction#3]
+- INPUT
)
- field (class:
org.apache.spark.sql.execution.aggregate.TungstenAggregate, name: child, type:
class org.apache.spark.sql.execution.SparkPlan)
- object (class
org.apache.spark.sql.execution.aggregate.TungstenAggregate,
TungstenAggregate(key=[gm_prediction#3],
functions=[(count(1),mode=Partial,isDistinct=false)],
output=[gm_prediction#3,count#32L])
+- Project [UDF(features#0) AS gm_prediction#3]
+- INPUT
)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 3)
- field (class:
org.apache.spark.sql.execution.WholeStageCodegen$$anonfun$6, name:
references$1, type: class [Ljava.lang.Object;)
- object (class
org.apache.spark.sql.execution.WholeStageCodegen$$anonfun$6, <function2>)
at
org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at
org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at
org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:302)
... 103 more
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]