[ 
https://issues.apache.org/jira/browse/SPARK-47230?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ling Qin updated SPARK-47230:
-----------------------------
    Description: 
See the code snippet below: when exploding an array of structs and selecting one 
field of the struct, some unexpected behaviour is observed:
 * If the field of the struct appears in the select clause but not in the where 
clause, column pruning works as expected.
 * If the field of the struct appears in both the select clause and the where 
clause, column pruning does not work.
 * If the field of the struct is not even selected, column pruning does not 
work.

{code:java}
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StructField, StructType, ArrayType
import random

spark = SparkSession.builder.appName("example").getOrCreate()
# Create an RDD with an array of structs, each array having a random size between 5 and 10
rdd = spark.range(1000).rdd.map(lambda x: (x.id + 3, [(x.id + i, x.id - i) for 
i in range(1, random.randint(5, 11))]))

# Define a new schema

schema = StructType([
    StructField("a", IntegerType(), True),
    StructField("b", ArrayType(StructType([
        StructField("x", IntegerType(), True),
        StructField("y", IntegerType(), True)
    ])), True)
])

# Create a DataFrame with the new schema
df = spark.createDataFrame(rdd, schema=schema)

# Write the DataFrame to a parquet file
df.repartition(1).write.mode('overwrite').parquet('test.parquet')

# Read the parquet file back into a DataFrame
df = spark.read.parquet('test.parquet')

spark.conf.set('spark.sql.optimizer.nestedSchemaPruning.enabled', 'true')

# case 1, as expected
sql_query = """
SELECT a, EXPLODE(b.x) AS bb
FROM df_view
"""
spark.sql(sql_query).explain()

# ReadSchema: struct<a:int,b:array<struct<x:int>>>

# case 2, as expected
sql_query = """
SELECT a, bb.x 
FROM df_view 
lateral view explode(b) as bb
"""
spark.sql(sql_query).explain()

# ReadSchema: struct<a:int,b:array<struct<x:int>>>

# case 3, bug: should only read b.x
sql_query = """
SELECT a, bb.x 
FROM df_view 
lateral view explode(b) as bb
where bb.x is not null
"""
spark.sql(sql_query).explain()

#ReadSchema: struct<a:int,b:array<struct<x:int,y:int>>>

#case 4, bug? seems no need to read both a and b
sql_query = """
SELECT a
FROM df_view 
lateral view explode(b) as bb
"""
spark.sql(sql_query).explain()

#ReadSchema: struct<a:int,b:array<struct<x:int,y:int>>>{code}
 
 
 

  was:
{code:java}
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StructField, StructType, ArrayType
import random

spark = SparkSession.builder.appName("example").getOrCreate()
# Create an RDD with an array of structs, each array having a random size between 5 and 10
rdd = spark.range(1000).rdd.map(lambda x: (x.id + 3, [(x.id + i, x.id - i) for 
i in range(1, random.randint(5, 11))]))

# Define a new schema

schema = StructType([
    StructField("a", IntegerType(), True),
    StructField("b", ArrayType(StructType([
        StructField("x", IntegerType(), True),
        StructField("y", IntegerType(), True)
    ])), True)
])

# Create a DataFrame with the new schema
df = spark.createDataFrame(rdd, schema=schema)

# Write the DataFrame to a parquet file
df.repartition(1).write.mode('overwrite').parquet('test.parquet')

# Read the parquet file back into a DataFrame
df = spark.read.parquet('test.parquet')

spark.conf.set('spark.sql.optimizer.nestedSchemaPruning.enabled', 'true')

# case 1, as expected
sql_query = """
SELECT a, EXPLODE(b.x) AS bb
FROM df_view
"""
spark.sql(sql_query).explain()

# ReadSchema: struct<a:int,b:array<struct<x:int>>>

# case 2, as expected
sql_query = """
SELECT a, bb.x 
FROM df_view 
lateral view explode(b) as bb
"""
spark.sql(sql_query).explain()

# ReadSchema: struct<a:int,b:array<struct<x:int>>>

# case 3, bug: should only read b.x
sql_query = """
SELECT a, bb.x 
FROM df_view 
lateral view explode(b) as bb
where bb.x is not null
"""
spark.sql(sql_query).explain()

#ReadSchema: struct<a:int,b:array<struct<x:int,y:int>>>

#case 4, bug? seems no need to read both a and b
sql_query = """
SELECT a
FROM df_view 
lateral view explode(b) as bb
"""
spark.sql(sql_query).explain()

#ReadSchema: struct<a:int,b:array<struct<x:int,y:int>>>{code}
 
 
 


> The column pruning is not working as expected for the nested struct in an 
> array
> -------------------------------------------------------------------------------
>
>                 Key: SPARK-47230
>                 URL: https://issues.apache.org/jira/browse/SPARK-47230
>             Project: Spark
>          Issue Type: Bug
>          Components: Optimizer, Spark Core, Spark Shell
>    Affects Versions: 3.3.0, 3.4.0, 3.5.0, 3.5.1
>            Reporter: Ling Qin
>            Priority: Major
>
> See the code snippet below: when exploding an array of structs and selecting 
> one field of the struct, some unexpected behaviour is observed:
>  * If the field of the struct appears in the select clause but not in the 
> where clause, column pruning works as expected.
>  * If the field of the struct appears in both the select clause and the where 
> clause, column pruning does not work.
>  * If the field of the struct is not even selected, column pruning does not 
> work.
> {code:java}
> from pyspark.sql import SparkSession
> from pyspark.sql.types import IntegerType, StructField, StructType, ArrayType
> import random
> spark = SparkSession.builder.appName("example").getOrCreate()
> # Create an RDD with an array of structs, each array having a random size between 5 and 10
> rdd = spark.range(1000).rdd.map(lambda x: (x.id + 3, [(x.id + i, x.id - i) 
> for i in range(1, random.randint(5, 11))]))
> # Define a new schema
> schema = StructType([
>     StructField("a", IntegerType(), True),
>     StructField("b", ArrayType(StructType([
>         StructField("x", IntegerType(), True),
>         StructField("y", IntegerType(), True)
>     ])), True)
> ])
> # Create a DataFrame with the new schema
> df = spark.createDataFrame(rdd, schema=schema)
> # Write the DataFrame to a parquet file
> df.repartition(1).write.mode('overwrite').parquet('test.parquet')
> # Read the parquet file back into a DataFrame
> df = spark.read.parquet('test.parquet')
> spark.conf.set('spark.sql.optimizer.nestedSchemaPruning.enabled', 'true')
> # case 1, as expected
> sql_query = """
> SELECT a, EXPLODE(b.x) AS bb
> FROM df_view
> """
> spark.sql(sql_query).explain()
> # ReadSchema: struct<a:int,b:array<struct<x:int>>>
> # case 2, as expected
> sql_query = """
> SELECT a, bb.x 
> FROM df_view 
> lateral view explode(b) as bb
> """
> spark.sql(sql_query).explain()
> # ReadSchema: struct<a:int,b:array<struct<x:int>>>
> # case 3, bug: should only read b.x
> sql_query = """
> SELECT a, bb.x 
> FROM df_view 
> lateral view explode(b) as bb
> where bb.x is not null
> """
> spark.sql(sql_query).explain()
> #ReadSchema: struct<a:int,b:array<struct<x:int,y:int>>>
> #case 4, bug? seems no need to read both a and b
> sql_query = """
> SELECT a
> FROM df_view 
> lateral view explode(b) as bb
> """
> spark.sql(sql_query).explain()
> #ReadSchema: struct<a:int,b:array<struct<x:int,y:int>>>{code}
>  
>  
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to