[ https://issues.apache.org/jira/browse/SPARK-29186?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Tarun Khaneja updated SPARK-29186: ---------------------------------- Description: I am writing a program to analyze sql query. So I am using Spark logical plan.I am writing a program to analyze sql query. So I am using Spark logical plan. Below is the code which I am using {code:java} object QueryAnalyzer { val LOG = LoggerFactory.getLogger(this.getClass) //Spark Conf val conf = new SparkConf().setMaster("local[2]").setAppName("LocalEdlExecutor") //Spark Context val sc = new SparkContext(conf) //sql Context val sqlContext = new SQLContext(sc) //Spark Session val sparkSession = SparkSession .builder() .appName("Spark User Data") .config("spark.app.name", "LocalEdl") .getOrCreate() def main(args: Array[String]) { var inputDfColumns = Map[String,List[String]]() val dfSession = sparkSession.read.format("csv"). option("header", "true"). option("inferschema", "true"). option("delimiter", ",").option("decoding", "utf8").option("multiline", true) var oDF = dfSession. load("C:\\Users\\tarun.khaneja\\data\\order.csv") println("smaple data in oDF====>") oDF.show() var cusDF = dfSession. load("C:\\Users\\tarun.khaneja\\data\\customer.csv") println("smaple data in cusDF====>") cusDF.show() oDF.createOrReplaceTempView("orderTempView") cusDF.createOrReplaceTempView("customerTempView") //get input columns from all dataframe inputDfColumns += ("orderTempView"->oDF.columns.toList) inputDfColumns += ("customerTempView"->cusDF.columns.toList) val res = sqlContext.sql("""select OID, max(MID+CID) as MID_new,ROW_NUMBER() OVER ( ORDER BY CID) as rn from (select OID_1 as OID, CID_1 as CID, OID_1+CID_1 as MID from (select min(ot.OrderID) as OID_1, ct.CustomerID as CID_1 from orderTempView as ot inner join customerTempView as ct on ot.CustomerID = ct.CustomerID group by CID_1)) group by OID,CID""") println(res.show(false)) val analyzedPlan = res.queryExecution.analyzed println(analyzedPlan.prettyJson) } {code} Now problem is, with *Spark 2.2.1*, I am getting below json. 
where I have SubqueryAlias which provide important information of alias name for table which we used in query, as shown below. {noformat} ... ... ... [ { "class" : "org.apache.spark.sql.catalyst.expressions.AttributeReference", "num-children" : 0, "name" : "OrderDate", "dataType" : "string", "nullable" : true, "metadata" : { }, "exprId" : { "product-class" : "org.apache.spark.sql.catalyst.expressions.ExprId", "id" : 2, "jvmId" : "acefe6e6-e469-4c9a-8a36-5694f054dc0a" }, "isGenerated" : false } ] ] }, { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children" : 1, "alias" : "ct", "child" : 0 } , { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children" : 1, "alias" : "customertempview", "child" : 0 } , { "class" : "org.apache.spark.sql.execution.datasources.LogicalRelation", "num-children" : 0, "relation" : null, "output" : ... ... ... But with Spark 2.4.3, I am getting SubqueryAlias name as null. As shown below in json. ... ... { "class": "org.apache.spark.sql.catalyst.expressions.AttributeReference", "num-children": 0, "name": "CustomerID", "dataType": "integer", "nullable": true, "metadata": {}, "exprId": { "product-class": "org.apache.spark.sql.catalyst.expressions.ExprId", "id": 19, "jvmId": "3b0dde0c-0b8f-4c63-a3ed-4dba526f8331" }, "qualifier": "[ct]" }] }, { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children": 1, "name": null, "child": 0 } , { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children": 1, "name": null, "child": 0 } , { "class": "org.apache.spark.sql.execution.datasources.LogicalRelation", "num-children": 0, "relation": null, "output": ... ...{noformat} ... ... ... 
[ \{ "class" : "org.apache.spark.sql.catalyst.expressions.AttributeReference", "num-children" : 0, "name" : "OrderDate", "dataType" : "string", "nullable" : true, "metadata" : { }, "exprId" : \{ "product-class" : "org.apache.spark.sql.catalyst.expressions.ExprId", "id" : 2, "jvmId" : "acefe6e6-e469-4c9a-8a36-5694f054dc0a" }, "isGenerated" : false } ] ] }, { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children" : 1, "alias" : "ct", "child" : 0 } , { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children" : 1, "alias" : "customertempview", "child" : 0 } , { "class" : "org.apache.spark.sql.execution.datasources.LogicalRelation", "num-children" : 0, "relation" : null, "output" : ... ... ... But with *Spark 2.4.3*, I am getting the SubqueryAlias name as null, as shown in the JSON below. ... ... \{ "class": "org.apache.spark.sql.catalyst.expressions.AttributeReference", "num-children": 0, "name": "CustomerID", "dataType": "integer", "nullable": true, "metadata": {}, "exprId": \{ "product-class": "org.apache.spark.sql.catalyst.expressions.ExprId", "id": 19, "jvmId": "3b0dde0c-0b8f-4c63-a3ed-4dba526f8331" }, "qualifier": "[ct]" }] }, { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children": 1, "name": null, "child": 0 } , { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children": 1, "name": null, "child": 0 } , { "class": "org.apache.spark.sql.execution.datasources.LogicalRelation", "num-children": 0, "relation": null, "output": ... ... So, I am not sure whether this is a bug in Spark 2.4 that causes the name to be null in SubqueryAlias. If it is not a bug, how can I get the relation between the alias name and the real table name? Any idea on this? was: I am writing a program to analyze sql query. So I am using Spark logical plan.I am writing a program to analyze sql query. So I am using Spark logical plan. 
Below is the code which I am using {code:java} object QueryAnalyzer { val LOG = LoggerFactory.getLogger(this.getClass) //Spark Conf val conf = new SparkConf().setMaster("local[2]").setAppName("LocalEdlExecutor") //Spark Context val sc = new SparkContext(conf) //sql Context val sqlContext = new SQLContext(sc) //Spark Session val sparkSession = SparkSession .builder() .appName("Spark User Data") .config("spark.app.name", "LocalEdl") .getOrCreate() def main(args: Array[String]) { var inputDfColumns = Map[String,List[String]]() val dfSession = sparkSession.read.format("csv"). option("header", "true"). option("inferschema", "true"). option("delimiter", ",").option("decoding", "utf8").option("multiline", true) var oDF = dfSession. load("C:\\Users\\tarun.khaneja\\data\\order.csv") println("smaple data in oDF====>") oDF.show() var cusDF = dfSession. load("C:\\Users\\tarun.khaneja\\data\\customer.csv") println("smaple data in cusDF====>") cusDF.show() oDF.createOrReplaceTempView("orderTempView") cusDF.createOrReplaceTempView("customerTempView") //get input columns from all dataframe inputDfColumns += ("orderTempView"->oDF.columns.toList) inputDfColumns += ("customerTempView"->cusDF.columns.toList) val res = sqlContext.sql("""select OID, max(MID+CID) as MID_new,ROW_NUMBER() OVER ( ORDER BY CID) as rn from (select OID_1 as OID, CID_1 as CID, OID_1+CID_1 as MID from (select min(ot.OrderID) as OID_1, ct.CustomerID as CID_1 from orderTempView as ot inner join customerTempView as ct on ot.CustomerID = ct.CustomerID group by CID_1)) group by OID,CID""") println(res.show(false)) val analyzedPlan = res.queryExecution.analyzed println(analyzedPlan.prettyJson) } {code} Now problem is, with *Spark 2.2.1*, I am getting below json. where I have SubqueryAlias which provide important information of alias name for table which we used in query, as shown below. ... ... ... 
[ \{ "class" : "org.apache.spark.sql.catalyst.expressions.AttributeReference", "num-children" : 0, "name" : "OrderDate", "dataType" : "string", "nullable" : true, "metadata" : { }, "exprId" : \{ "product-class" : "org.apache.spark.sql.catalyst.expressions.ExprId", "id" : 2, "jvmId" : "acefe6e6-e469-4c9a-8a36-5694f054dc0a" }, "isGenerated" : false } ] ] }, { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children" : 1, "alias" : "ct", "child" : 0 } , { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children" : 1, "alias" : "customertempview", "child" : 0 } , { "class" : "org.apache.spark.sql.execution.datasources.LogicalRelation", "num-children" : 0, "relation" : null, "output" : ... ... ... But with *Spark 2.4.3*, I am getting SubqueryAlias name as null. As shown below in json. ... ... \{ "class": "org.apache.spark.sql.catalyst.expressions.AttributeReference", "num-children": 0, "name": "CustomerID", "dataType": "integer", "nullable": true, "metadata": {}, "exprId": \{ "product-class": "org.apache.spark.sql.catalyst.expressions.ExprId", "id": 19, "jvmId": "3b0dde0c-0b8f-4c63-a3ed-4dba526f8331" }, "qualifier": "[ct]" }] }, { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children": 1, "name": null, "child": 0 } , { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", "num-children": 1, "name": null, "child": 0 } , { "class": "org.apache.spark.sql.execution.datasources.LogicalRelation", "num-children": 0, "relation": null, "output": ... ... So, I am not sure if it is bug in Spark 2.4 because of which I am getting name as null in SubquerAlias.Or if it is not bug then how can I get relation between alias name and real table name. Any idea on this? > SubqueryAlias name value is null in Spark 2.4.3 Logical plan. 
> ------------------------------------------------------------- > > Key: SPARK-29186 > URL: https://issues.apache.org/jira/browse/SPARK-29186 > Project: Spark > Issue Type: Bug > Components: Spark Core > Affects Versions: 2.4.3 > Environment: I have tried this on AWS Glue with Spark 2.4.3 > and on windows 10 with 2.4.4 > at both of them facing same issue > Reporter: Tarun Khaneja > Priority: Blocker > Fix For: 2.2.1 > > > I am writing a program to analyze sql query. So I am using Spark logical > plan.I am writing a program to analyze sql query. So I am using Spark logical > plan. > Below is the code which I am using > > {code:java} > object QueryAnalyzer > { > val LOG = LoggerFactory.getLogger(this.getClass) //Spark Conf > > val conf = new > SparkConf().setMaster("local[2]").setAppName("LocalEdlExecutor") > //Spark Context > val sc = new SparkContext(conf) > //sql Context > val sqlContext = new SQLContext(sc) > > //Spark Session > val sparkSession = SparkSession > .builder() > .appName("Spark User Data") .config("spark.app.name", "LocalEdl") > .getOrCreate() > def main(args: Array[String]) > { > var inputDfColumns = Map[String,List[String]]() > val dfSession = sparkSession.read.format("csv"). option("header", > "true"). option("inferschema", "true"). option("delimiter", > ",").option("decoding", "utf8").option("multiline", true) > > var oDF = dfSession. load("C:\\Users\\tarun.khaneja\\data\\order.csv") > > println("smaple data in oDF====>") > > oDF.show() > var cusDF = dfSession. 
> load("C:\\Users\\tarun.khaneja\\data\\customer.csv") > println("smaple data in cusDF====>") cusDF.show() > oDF.createOrReplaceTempView("orderTempView") > cusDF.createOrReplaceTempView("customerTempView") > > //get input columns from all dataframe > inputDfColumns += > ("orderTempView"->oDF.columns.toList) > > inputDfColumns += > ("customerTempView"->cusDF.columns.toList) > > val res = sqlContext.sql("""select OID, max(MID+CID) as MID_new,ROW_NUMBER() > OVER ( > ORDER BY CID) as rn from (select OID_1 as OID, > CID_1 as CID, OID_1+CID_1 as MID from (select min(ot.OrderID) as OID_1, > ct.CustomerID as CID_1 from orderTempView as ot inner join customerTempView > as ct on ot.CustomerID = ct.CustomerID group by > CID_1)) group by OID,CID""") > println(res.show(false)) > val analyzedPlan = res.queryExecution.analyzed > println(analyzedPlan.prettyJson) > } > {code} > > Now problem is, with *Spark 2.2.1*, I am getting below json. where I have > SubqueryAlias which provide important information of alias name for table > which we used in query, as shown below. > > {noformat} > ... ... ... [ { "class" : > "org.apache.spark.sql.catalyst.expressions.AttributeReference", > "num-children" : 0, "name" : "OrderDate", "dataType" : "string", "nullable" : > true, "metadata" : { }, "exprId" : { "product-class" : > "org.apache.spark.sql.catalyst.expressions.ExprId", "id" : 2, "jvmId" : > "acefe6e6-e469-4c9a-8a36-5694f054dc0a" }, "isGenerated" : false } ] ] }, > { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", > "num-children" : 1, "alias" : "ct", "child" : 0 } > , > { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", > "num-children" : 1, "alias" : "customertempview", "child" : 0 } > , { "class" : "org.apache.spark.sql.execution.datasources.LogicalRelation", > "num-children" : 0, "relation" : null, "output" : > ... ... ... > But with Spark 2.4.3, I am getting SubqueryAlias name as null. As shown > below in json. > ... ... 
{ "class": > "org.apache.spark.sql.catalyst.expressions.AttributeReference", > "num-children": 0, "name": "CustomerID", "dataType": "integer", "nullable": > true, "metadata": {}, "exprId": { "product-class": > "org.apache.spark.sql.catalyst.expressions.ExprId", "id": 19, "jvmId": > "3b0dde0c-0b8f-4c63-a3ed-4dba526f8331" }, "qualifier": "[ct]" }] }, > { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", > "num-children": 1, "name": null, "child": 0 } > , > { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", > "num-children": 1, "name": null, "child": 0 } > , { "class": "org.apache.spark.sql.execution.datasources.LogicalRelation", > "num-children": 0, "relation": null, "output": > ... ...{noformat} > ... ... ... [ \{ "class" : > "org.apache.spark.sql.catalyst.expressions.AttributeReference", > "num-children" : 0, "name" : "OrderDate", "dataType" : "string", "nullable" : > true, "metadata" : { }, "exprId" : \{ "product-class" : > "org.apache.spark.sql.catalyst.expressions.ExprId", "id" : 2, "jvmId" : > "acefe6e6-e469-4c9a-8a36-5694f054dc0a" }, "isGenerated" : false } ] ] }, > { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", > "num-children" : 1, "alias" : "ct", "child" : 0 } > , > { "class" : "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", > "num-children" : 1, "alias" : "customertempview", "child" : 0 } > , { "class" : "org.apache.spark.sql.execution.datasources.LogicalRelation", > "num-children" : 0, "relation" : null, "output" : > ... ... ... > But with *Spark 2.4.3*, I am getting SubqueryAlias name as null. As shown > below in json. > ... ... 
\{ "class": > "org.apache.spark.sql.catalyst.expressions.AttributeReference", > "num-children": 0, "name": "CustomerID", "dataType": "integer", "nullable": > true, "metadata": {}, "exprId": \{ "product-class": > "org.apache.spark.sql.catalyst.expressions.ExprId", "id": 19, "jvmId": > "3b0dde0c-0b8f-4c63-a3ed-4dba526f8331" }, "qualifier": "[ct]" }] }, > { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", > "num-children": 1, "name": null, "child": 0 } > , > { "class": "org.apache.spark.sql.catalyst.plans.logical.*SubqueryAlias*", > "num-children": 1, "name": null, "child": 0 } > , { "class": "org.apache.spark.sql.execution.datasources.LogicalRelation", > "num-children": 0, "relation": null, "output": > ... ... > So, I am not sure whether this is a bug in Spark 2.4 that causes the name > to be null in SubqueryAlias. If it is not a bug, how can I get the relation > between the alias name and the real table name? > Any idea on this? -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org