[ 
https://issues.apache.org/jira/browse/SPARK-47104?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17818934#comment-17818934
 ] 

Bruce Robbins commented on SPARK-47104:
---------------------------------------

It's not a CSV-specific issue. You can reproduce it with a cached view. The 
following fails on the master branch, when using {{spark-sql}}:
{noformat}
create or replace temp view v1(id, name) as values
(1, "fred"),
(2, "bob");

cache table v1;

select name, uuid() as _iid from (
  select s.name
  from v1 s
  join v1 t
  on s.name = t.name
  order by name
)
limit 20;
{noformat}
The exception is:
{noformat}
java.lang.NullPointerException: Cannot invoke 
"org.apache.spark.sql.catalyst.util.RandomUUIDGenerator.getNextUUIDUTF8String()"
 because "this.randomGen_0" is null
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown
 Source)
        at 
org.apache.spark.sql.execution.TakeOrderedAndProjectExec.$anonfun$executeCollect$6(limit.scala:297)
        at scala.collection.ArrayOps$.map$extension(ArrayOps.scala:934)
        at 
org.apache.spark.sql.execution.TakeOrderedAndProjectExec.$anonfun$executeCollect$1(limit.scala:297)
        at 
org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at 
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
        at 
org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:286)
        at 
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:390)
        at 
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:418)
        at 
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:390)
{noformat}
It seems that non-deterministic expressions are not getting initialized before 
being used in the unsafe projection. I can take a look.

> Spark SQL query fails with NullPointerException
> -----------------------------------------------
>
>                 Key: SPARK-47104
>                 URL: https://issues.apache.org/jira/browse/SPARK-47104
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.2.1
>            Reporter: Chhavi Bansal
>            Priority: Major
>
> I am trying to run a very simple SQL query involving a join and an order by 
> clause, and then using the UUID() function in the outermost select statement. The query fails.
> {code:java}
> val df = spark.read.format("csv").option("header", 
> "true").load("src/main/resources/titanic.csv")
> df.createOrReplaceTempView("titanic")
> val query = spark.sql(" select name, uuid() as _iid from (select s.name from 
> titanic s join titanic t on s.name = t.name order by name) ;") 
> query.show() // FAILS{code}
> Dataset is a normal csv file with the following columns
> {code:java}
> PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
>  {code}
> Below is the error
> {code:java}
> Exception in thread "main" java.lang.NullPointerException
> at 
> org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown
>  Source)
> at 
> org.apache.spark.sql.execution.TakeOrderedAndProjectExec.$anonfun$executeCollect$2(limit.scala:207)
> at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:237)
> at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
> at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
> at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
> at scala.collection.TraversableLike.map(TraversableLike.scala:237)
> at scala.collection.TraversableLike.map$(TraversableLike.scala:230)
> at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
> at 
> org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:207)
> at 
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:338)
> at 
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:366)
> at 
> org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:338)
> at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
> at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2728)
> at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
> at 
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
> at 
> org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
> at 
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
> at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
> at 
> org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
> at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
> at org.apache.spark.sql.Dataset.head(Dataset.scala:2728)
> at org.apache.spark.sql.Dataset.take(Dataset.scala:2935)
> at org.apache.spark.sql.Dataset.getRows(Dataset.scala:287)
> at org.apache.spark.sql.Dataset.showString(Dataset.scala:326)
> at org.apache.spark.sql.Dataset.show(Dataset.scala:808)
> at org.apache.spark.sql.Dataset.show(Dataset.scala:785)
> at 
> hyperspace2.sparkPlan$.delayedEndpoint$hyperspace2$sparkPlan$1(sparkPlan.scala:14)
> at hyperspace2.sparkPlan$delayedInit$body.apply(sparkPlan.scala:6)
> at scala.Function0.apply$mcV$sp(Function0.scala:39)
> at scala.Function0.apply$mcV$sp$(Function0.scala:39)
> at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:17)
> at scala.App.$anonfun$main$1$adapted(App.scala:80)
> at scala.collection.immutable.List.foreach(List.scala:392)
> at scala.App.main(App.scala:80)
> at scala.App.main$(App.scala:78)
> at hyperspace2.sparkPlan$.main(sparkPlan.scala:6)
> at hyperspace2.sparkPlan.main(sparkPlan.scala) {code}
> Note:
>  # Here, if I remove the order by clause, then it produces the correct output.
>  # This happens when I read the dataset from a CSV file; it works fine if I make 
> the dataframe using Seq().toDF.
>  # The query fails if I use spark.sql("query").show(), but succeeds when I 
> simply write it to a CSV file.
> [https://stackoverflow.com/questions/78020267/spark-sql-query-fails-with-nullpointerexception]
> Please can someone look into why this happens only when using `show()`, since 
> this is failing queries in production for me.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to