Ling Qin created SPARK-47230:
--------------------------------
Summary: The column pruning is not working as expected for the
nested struct in an array
Key: SPARK-47230
URL: https://issues.apache.org/jira/browse/SPARK-47230
Project: Spark
Issue Type: Bug
Components: Optimizer, Spark Core, Spark Shell
Affects Versions: 3.5.1, 3.5.0, 3.4.0, 3.3.0
Reporter: Ling Qin
{code:java}
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StructField, StructType, ArrayType
import random
spark = SparkSession.builder.appName("example").getOrCreate()# Create an RDD
with an array of structs, each array having a random size between 5 to 10
rdd = spark.range(1000).rdd.map(lambda x: (x.id + 3, [(x.id + i, x.id - i) for
i in range(1, random.randint(5, 11))]))
# Define a new schema
schema = StructType([
StructField("a", IntegerType(), True),
StructField("b", ArrayType(StructType([
StructField("x", IntegerType(), True),
StructField("y", IntegerType(), True)
])), True)
])
# Create a DataFrame with the new schema
df = spark.createDataFrame(rdd, schema=schema)
# Write the DataFrame to a parquet file
df.repartition(1).write.mode('overwrite').parquet('test.parquet')
# Read the parquet file back into a DataFrame
df = spark.read.parquet('test.parquet')
spark.conf.set('spark.sql.optimizer.nestedSchemaPruning.enabled', 'true')
# case 1, as expected
sql_query = """
SELECT a, EXPLODE(b.x) AS bb
FROM df_view
"""
spark.sql(sql_query).explain()
# ReadSchema: struct<a:int,b:array<struct<x:int>>>
# case 2, as expected
sql_query = """
SELECT a, bb.x
FROM df_view
lateral view explode(b) as bb
"""
spark.sql(sql_query).explain()
# ReadSchema: struct<a:int,b:array<struct<x:int>>>
# case 3, bug: should only read b.x
sql_query = """
SELECT a, bb.x
FROM df_view
lateral view explode(b) as bb
where bb.x is not null
"""
spark.sql(sql_query).explain()
#ReadSchema: struct<a:int,b:array<struct<x:int,y:int>>>
#case 4, bug? seems no need to read both a and b
sql_query = """
SELECT a
FROM df_view
lateral view explode(b) as bb
"""
spark.sql(sql_query).explain()
#ReadSchema: struct<a:int,b:array<struct<x:int,y:int>>>{code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]