[ https://issues.apache.org/jira/browse/SPARK-12586?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
shao lo updated SPARK-12586: ---------------------------- Description: The following sequence of sql(), registerTempTable() calls gets the wrong answer. The correct answer is returned if the temp table is rewritten? sql_text = """select row, col, foo, bar, value2 value from (select row, col, foo, bar, 8 value2 from t0 where row=1 and col=2) s1 union select row, col, foo, bar, value from t0 where not (row=1 and col=2)""" df2 = sqlContext.sql(sql_text) df2.registerTempTable("t1") # # The following 2 line workaround fixes the problem somehow? # df3 = sqlContext.createDataFrame(df2.collect()) # df3.registerTempTable("t1") # # The following 4 line workaround fixes the problem too..but takes way longer # filename = "t1.json" # df2.write.json(filename, mode='overwrite') # df3 = sqlContext.read.json(filename) # df3.registerTempTable("t1") sql_text2 = """select row, col, v1 value from (select v1 from (select v_value v1 from values) s1 left join (select value v2,foo,bar,row,col from t1 where foo=1 and bar=2 and value is not null) s2 on v1=v2 where v2 is null ) sa join (select row, col from t1 where foo=1 and bar=2 and value is null) sb""" result = sqlContext.sql(sql_text2) result.show() # Expected result # +---+---+-----+ # |row|col|value| # +---+---+-----+ # | 3| 4| 1| # | 3| 4| 2| # | 3| 4| 3| # | 3| 4| 4| # +---+---+-----+ # Getting this wrong result...when not using the workarounds above # +---+---+-----+ # |row|col|value| # +---+---+-----+ # +---+---+-----+ was: The following python script gets the wrong answer unless workarounds are used... from pyspark import SparkContext from pyspark.sql import SQLContext if __name__ == "__main__": sc = SparkContext(appName="PythonSQLbug") sqlContext = SQLContext(sc) data = [(v,) for v in range(1, 5)] values = sqlContext.createDataFrame(data, ['v_value']) values.registerTempTable("values") values.show() data = [ (3, 1, 1, 1, None), (2, 1, 1, 1, 3), (3, 2, 1, 1, None), (3, 3, 1, 1, 2), (3, 4, 1, 2, None)] df1 = sqlContext.createDataFrame(data, ['row', 'col', 'foo', 'bar', 'value']) df1.registerTempTable("t0") df1.show() sql_text = """select row, col, foo, bar, value2 value from (select row, col, foo, bar, 8 value2 from t0 where row=1 and col=2) s1 union select row, col, foo, bar, value from t0 where not (row=1 and col=2)""" df2 = sqlContext.sql(sql_text) df2.registerTempTable("t1") # # The following 2 line workaround fixes the problem somehow? # df3 = sqlContext.createDataFrame(df2.collect()) # df3.registerTempTable("t1") # # The following 4 line workaround fixes the problem too..but takes way longer # filename = "t1.json" # df2.write.json(filename, mode='overwrite') # df3 = sqlContext.read.json(filename) # df3.registerTempTable("t1") sql_text2 = """select row, col, v1 value from (select v1 from (select v_value v1 from values) s1 left join (select value v2,foo,bar,row,col from t1 where foo=1 and bar=2 and value is not null) s2 on v1=v2 where v2 is null ) sa join (select row, col from t1 where foo=1 and bar=2 and value is null) sb""" result = sqlContext.sql(sql_text2) result.show() # Expected result # +---+---+-----+ # |row|col|value| # +---+---+-----+ # | 3| 4| 1| # | 3| 4| 2| # | 3| 4| 3| # | 3| 4| 4| # +---+---+-----+ # Getting this wrong result...when not using the workarounds above # +---+---+-----+ # |row|col|value| # +---+---+-----+ # +---+---+-----+ sc.stop() > Wrong answer with registerTempTable and union sql query > ------------------------------------------------------- > > Key: SPARK-12586 > URL: https://issues.apache.org/jira/browse/SPARK-12586 > Project: Spark > Issue Type: Bug > Affects Versions: 1.5.2 > Environment: Windows 7 > Reporter: shao lo > Attachments: sql_bug.py > > > The following sequence of sql(), registerTempTable() calls gets the wrong > answer. > The correct answer is returned if the temp table is rewritten? > sql_text = """select row, col, foo, bar, value2 value > from (select row, col, foo, bar, 8 value2 from t0 where row=1 > and col=2) s1 > union select row, col, foo, bar, value from t0 where > not (row=1 and col=2)""" > df2 = sqlContext.sql(sql_text) > df2.registerTempTable("t1") > # # The following 2 line workaround fixes the problem somehow? > # df3 = sqlContext.createDataFrame(df2.collect()) > # df3.registerTempTable("t1") > # # The following 4 line workaround fixes the problem too..but takes way > longer > # filename = "t1.json" > # df2.write.json(filename, mode='overwrite') > # df3 = sqlContext.read.json(filename) > # df3.registerTempTable("t1") > sql_text2 = """select row, col, v1 value from > (select v1 from > (select v_value v1 from values) s1 > left join > (select value v2,foo,bar,row,col from t1 > where foo=1 > and bar=2 and value is not null) s2 > on v1=v2 where v2 is null > ) sa join > (select row, col from t1 where foo=1 > and bar=2 and value is null) sb""" > result = sqlContext.sql(sql_text2) > result.show() > > # Expected result > # +---+---+-----+ > # |row|col|value| > # +---+---+-----+ > # | 3| 4| 1| > # | 3| 4| 2| > # | 3| 4| 3| > # | 3| 4| 4| > # +---+---+-----+ > # Getting this wrong result...when not using the workarounds above > # +---+---+-----+ > # |row|col|value| > # +---+---+-----+ > # +---+---+-----+ -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org