[GitHub] spark pull request: [SPARK-14433][PySpark][ML]:PySpark ml Gaussian...

wangmiao1981 Thu, 21 Apr 2016 23:00:03 -0700

Github user wangmiao1981 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/12402#discussion_r60694336
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala ---
    @@ -104,6 +105,17 @@ class GaussianMixtureModel private[ml] (
       @Since("2.0.0")
       def gaussians: Array[MultivariateGaussian] = parentModel.gaussians
     
    +  /**
    +   * Helper method used in Python
    +   */
    +  def gaussiansDF: DataFrame = {
    --- End diff --
    
    @jkbradley 
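    After changing to the lines above, I got the errors below. It seems that I 
    can't keep a SparkContext in the model to get the sqlContext: the model's 
    `sc` field is captured by the UDF closure, and SparkContext is not 
    serializable.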
    
    Errors:
    
    execute, tree:
    Exchange hashpartitioning(gm_prediction#3, 200), None
    +- WholeStageCodegen
       :  +- TungstenAggregate(key=[gm_prediction#3], 
functions=[(count(1),mode=Partial,isDistinct=false)], 
output=[gm_prediction#3,count#32L])
       :     +- Project [UDF(features#0) AS gm_prediction#3]
       :        +- INPUT
       +- Scan ExistingRDD[features#0]
    
    org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, 
tree:
    Exchange hashpartitioning(gm_prediction#3, 200), None
    +- WholeStageCodegen
       :  +- TungstenAggregate(key=[gm_prediction#3], 
functions=[(count(1),mode=Partial,isDistinct=false)], 
output=[gm_prediction#3,count#32L])
       :     +- Project [UDF(features#0) AS gm_prediction#3]
       :        +- INPUT
       +- Scan ExistingRDD[features#0]
    
        at 
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:50)
        at 
org.apache.spark.sql.execution.exchange.ShuffleExchange.doExecute(ShuffleExchange.scala:109)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at 
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117)
        at 
org.apache.spark.sql.execution.InputAdapter.upstreams(WholeStageCodegen.scala:237)
        at 
org.apache.spark.sql.execution.aggregate.TungstenAggregate.upstreams(TungstenAggregate.scala:131)
        at 
org.apache.spark.sql.execution.WholeStageCodegen.doExecute(WholeStageCodegen.scala:352)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at 
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117)
        at 
org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:230)
        at 
org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:277)
        at 
org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2099)
        at 
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:53)
        at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2386)
        at 
org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2098)
        at 
org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2103)
        at 
org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2103)
        at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2399)
        at 
org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2103)
        at org.apache.spark.sql.Dataset.collect(Dataset.scala:2079)
        at 
org.apache.spark.ml.clustering.GaussianMixtureSummary.clusterSizes$lzycompute(GaussianMixture.scala:327)
        at 
org.apache.spark.ml.clustering.GaussianMixtureSummary.clusterSizes(GaussianMixture.scala:325)
        at 
org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply$mcV$sp(GaussianMixtureSuite.scala:102)
        at 
org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply(GaussianMixtureSuite.scala:73)
        at 
org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply(GaussianMixtureSuite.scala:73)
        at 
org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22)
        at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85)
        at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
        at org.scalatest.Transformer.apply(Transformer.scala:22)
        at org.scalatest.Transformer.apply(Transformer.scala:20)
        at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:166)
        at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:56)
        at 
org.scalatest.FunSuiteLike$class.invokeWithFixture$1(FunSuiteLike.scala:163)
        at 
org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175)
        at 
org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175)
        at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
        at org.scalatest.FunSuiteLike$class.runTest(FunSuiteLike.scala:175)
        at org.scalatest.FunSuite.runTest(FunSuite.scala:1555)
        at 
org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208)
        at 
org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208)
        at 
org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:413)
        at 
org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:401)
        at scala.collection.immutable.List.foreach(List.scala:381)
        at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
        at 
org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:396)
        at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:483)
        at org.scalatest.FunSuiteLike$class.runTests(FunSuiteLike.scala:208)
        at org.scalatest.FunSuite.runTests(FunSuite.scala:1555)
        at org.scalatest.Suite$class.run(Suite.scala:1424)
        at 
org.scalatest.FunSuite.org$scalatest$FunSuiteLike$$super$run(FunSuite.scala:1555)
        at 
org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212)
        at 
org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212)
        at org.scalatest.SuperEngine.runImpl(Engine.scala:545)
        at org.scalatest.FunSuiteLike$class.run(FunSuiteLike.scala:212)
        at 
org.apache.spark.SparkFunSuite.org$scalatest$BeforeAndAfterAll$$super$run(SparkFunSuite.scala:28)
        at 
org.scalatest.BeforeAndAfterAll$class.liftedTree1$1(BeforeAndAfterAll.scala:257)
        at 
org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:256)
        at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:28)
        at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:55)
        at 
org.scalatest.tools.Runner$$anonfun$doRunRunRunDaDoRunRun$3.apply(Runner.scala:2563)
        at 
org.scalatest.tools.Runner$$anonfun$doRunRunRunDaDoRunRun$3.apply(Runner.scala:2557)
        at scala.collection.immutable.List.foreach(List.scala:381)
        at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:2557)
        at 
org.scalatest.tools.Runner$$anonfun$runOptionallyWithPassFailReporter$2.apply(Runner.scala:1044)
        at 
org.scalatest.tools.Runner$$anonfun$runOptionallyWithPassFailReporter$2.apply(Runner.scala:1043)
        at 
org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:2722)
        at 
org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:1043)
        at org.scalatest.tools.Runner$.run(Runner.scala:883)
        at org.scalatest.tools.Runner.run(Runner.scala)
        at 
org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.runScalaTest2(ScalaTestRunner.java:138)
        at 
org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.main(ScalaTestRunner.java:28)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:606)
        at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
    Caused by: org.apache.spark.SparkException: Task not serializable
        at 
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:305)
        at 
org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:295)
        at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:123)
        at org.apache.spark.SparkContext.clean(SparkContext.scala:1944)
        at 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:782)
        at 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:781)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:357)
        at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:781)
        at 
org.apache.spark.sql.execution.WholeStageCodegen.doExecute(WholeStageCodegen.scala:355)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at 
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117)
        at 
org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:82)
        at 
org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:118)
        at 
org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:109)
        at 
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
        ... 83 more
    Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
    Serialization stack:
        - object not serializable (class: org.apache.spark.SparkContext, value: 
org.apache.spark.SparkContext@6eaadd1d)
        - field (class: org.apache.spark.ml.clustering.GaussianMixtureModel, 
name: sc, type: class org.apache.spark.SparkContext)
        - object (class org.apache.spark.ml.clustering.GaussianMixtureModel, 
GaussianMixture_0e35e80dd8f8)
        - field (class: 
org.apache.spark.ml.clustering.GaussianMixtureModel$$anonfun$3, name: $outer, 
type: class org.apache.spark.ml.clustering.GaussianMixtureModel)
        - object (class 
org.apache.spark.ml.clustering.GaussianMixtureModel$$anonfun$3, <function1>)
        - field (class: 
org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, name: func$2, 
type: interface scala.Function1)
        - object (class 
org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, <function1>)
        - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF, 
name: f, type: interface scala.Function1)
        - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF, 
UDF(features#0))
        - field (class: org.apache.spark.sql.catalyst.expressions.Alias, name: 
child, type: class org.apache.spark.sql.catalyst.expressions.Expression)
        - object (class org.apache.spark.sql.catalyst.expressions.Alias, 
UDF(features#0) AS gm_prediction#3)
        - element of array (index: 0)
        - array (class [Ljava.lang.Object;, size 1)
        - field (class: scala.collection.mutable.ArrayBuffer, name: array, 
type: class [Ljava.lang.Object;)
        - object (class scala.collection.mutable.ArrayBuffer, 
ArrayBuffer(UDF(features#0) AS gm_prediction#3))
        - field (class: org.apache.spark.sql.execution.Project, name: 
projectList, type: interface scala.collection.Seq)
        - object (class org.apache.spark.sql.execution.Project, Project 
[UDF(features#0) AS gm_prediction#3]
    +- INPUT
    )
        - field (class: 
org.apache.spark.sql.execution.aggregate.TungstenAggregate, name: child, type: 
class org.apache.spark.sql.execution.SparkPlan)
        - object (class 
org.apache.spark.sql.execution.aggregate.TungstenAggregate, 
TungstenAggregate(key=[gm_prediction#3], 
functions=[(count(1),mode=Partial,isDistinct=false)], 
output=[gm_prediction#3,count#32L])
    +- Project [UDF(features#0) AS gm_prediction#3]
       +- INPUT
    )
        - element of array (index: 0)
        - array (class [Ljava.lang.Object;, size 3)
        - field (class: 
org.apache.spark.sql.execution.WholeStageCodegen$$anonfun$6, name: 
references$1, type: class [Ljava.lang.Object;)
        - object (class 
org.apache.spark.sql.execution.WholeStageCodegen$$anonfun$6, <function2>)
        at 
org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
        at 
org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
        at 
org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
        at 
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:302)
        ... 103 more



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-14433][PySpark][ML]:PySpark ml Gaussian...

Reply via email to