richard gooding created SPARK-46093:
---------------------------------------

             Summary: append to parquet file with column type changed corrupts file
                 Key: SPARK-46093
                 URL: https://issues.apache.org/jira/browse/SPARK-46093
             Project: Spark
          Issue Type: Bug
          Components: Input/Output
    Affects Versions: 3.3.0
            Reporter: richard gooding


from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reproduction for SPARK-46093: appending to an existing Parquet path after
# changing a column's type, then reading the path back.
# NOTE(review): `spark` is not defined in this snippet - presumably the session
# object provided by the notebook / shell environment; confirm before running.
fnBad = "dbfs:/tmp/richard.good...@os.uk/test_bad_parquet/f1"
df = spark.createDataFrame([["aaaa"]]).select(col("_1").alias("aa"))
df.printSchema()

fmt = "parquet"
# fmt = "delta"
# Step 1: write the initial dataset, where "aa" holds a plain string value.
df.write.mode("overwrite").format(fmt).save(fnBad)
df.show()

# Step 2: change type of column "aa" by wrapping it in a struct - error on load.
df = df.withColumn("aa", struct(col("aa")))
df.printSchema()
df.show()
# Step 3: append the re-typed frame to the same path.
# With format = delta this append is reported to fail with:
#   "AnalysisException: Failed to merge fields 'aa' and 'aa'. Failed to merge
#   incompatible data types StringType and
#   StructType(StructField(aa,StringType,true))"
df.write.mode("append").format(fmt).save(fnBad)
# Alternative append below gives a different error, but only when dataframe read:
# df.write.mode("append").option("mergeSchema", "true").format(fmt).save(fnBad)

# Step 4: read the path back; the failure shows up here rather than at write time.
print(" --- at df 2 --- ")
df2 = spark.read.format(fmt).load(fnBad)
# df2 = spark.read.option("mergeSchema", "true").format(fmt).load(fnBad)
df2.show()  # this will error -



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to