[
https://issues.apache.org/jira/browse/SPARK-26012?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
eaton updated SPARK-26012:
--------------------------
Description:
Dynamic partition will fail when both '' and null values are taken as dynamic
partition values simultaneously.
For example, the test below will fail.
test("Null and '' values should not cause dynamic partition failure of string types") {
  withTable("t1", "t2") {
    spark.range(3).write.saveAsTable("t1")
    spark.sql("select id, cast(case when id = 1 then '' else null end as string) as p" +
      " from t1").write.partitionBy("p").saveAsTable("t2")
    checkAnswer(spark.table("t2").sort("id"), Seq(Row(0, null), Row(1, null), Row(2, null)))
  }
}
The error is: 'org.apache.hadoop.fs.FileAlreadyExistsException: File already
exists'.
Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: File already
exists:
file:/F:/learning/spark/spark_master/spark_compile/spark-warehouse/t2/_temporary/0/_temporary/attempt_20181111204354_0001_m_000000_0/p=__HIVE_DEFAULT_PARTITION__/part-00000-96217c96-3695-4f18-b0db-4f35a9078a3d.c000.snappy.parquet
at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:289)
at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
at
org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
at
org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
at
org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
at
org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
at
org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
at
org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.newOutputWriter(FileFormatDataWriter.scala:236)
at
org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.write(FileFormatDataWriter.scala:260)
at
org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:242)
at
org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:239)
at
org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
at
org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:245)
... 10 more
20:43:55.460 WARN
org.apache.spark.sql.execution.datasources.FileFormatWriterSuite:
was:
Dynamic partition will fail when both '' and null values are taken as dynamic
partition values simultaneously.
For example, the test below will fail before this PR:
test("Null and '' values should not cause dynamic partition failure of string
types") {
withTable("t1", "t2") {
spark.range(3).write.saveAsTable("t1")
spark.sql("select id, cast(case when id = 1 then '' else null end as string)
as p" +
" from t1").write.partitionBy("p").saveAsTable("t2")
checkAnswer(spark.table("t2").sort("id"), Seq(Row(0, null), Row(1, null),
Row(2, null)))
}
}
The error is: 'org.apache.hadoop.fs.FileAlreadyExistsException: File already
exists'.
Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: File already
exists:
file:/F:/learning/spark/spark_master/spark_compile/spark-warehouse/t2/_temporary/0/_temporary/attempt_20181111204354_0001_m_000000_0/p=__HIVE_DEFAULT_PARTITION__/part-00000-96217c96-3695-4f18-b0db-4f35a9078a3d.c000.snappy.parquet
at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:289)
at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
at
org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
at
org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
at
org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
at
org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
at
org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
at
org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.newOutputWriter(FileFormatDataWriter.scala:236)
at
org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.write(FileFormatDataWriter.scala:260)
at
org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:242)
at
org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:239)
at
org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
at
org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:245)
... 10 more
20:43:55.460 WARN
org.apache.spark.sql.execution.datasources.FileFormatWriterSuite:
> Dynamic partition will fail when both '' and null values are taken as dynamic
> partition values simultaneously.
> --------------------------------------------------------------------------------------------------------------
>
> Key: SPARK-26012
> URL: https://issues.apache.org/jira/browse/SPARK-26012
> Project: Spark
> Issue Type: Improvement
> Components: SQL
> Affects Versions: 2.4.0
> Reporter: eaton
> Priority: Major
>
> Dynamic partition will fail when both '' and null values are taken as dynamic
> partition values simultaneously.
> For example, the test below will fail.
> test("Null and '' values should not cause dynamic partition failure of string types") {
>   withTable("t1", "t2") {
>     spark.range(3).write.saveAsTable("t1")
>     spark.sql("select id, cast(case when id = 1 then '' else null end as string) as p" +
>       " from t1").write.partitionBy("p").saveAsTable("t2")
>     checkAnswer(spark.table("t2").sort("id"), Seq(Row(0, null), Row(1, null), Row(2, null)))
>   }
> }
> The error is: 'org.apache.hadoop.fs.FileAlreadyExistsException: File already
> exists'.
>
> Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: File already
> exists:
> file:/F:/learning/spark/spark_master/spark_compile/spark-warehouse/t2/_temporary/0/_temporary/attempt_20181111204354_0001_m_000000_0/p=__HIVE_DEFAULT_PARTITION__/part-00000-96217c96-3695-4f18-b0db-4f35a9078a3d.c000.snappy.parquet
> at
> org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:289)
> at
> org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
> at
> org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
> at
> org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
> at
> org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
> at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
> at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
> at
> org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
> at
> org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
> at
> org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
> at
> org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
> at
> org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.newOutputWriter(FileFormatDataWriter.scala:236)
> at
> org.apache.spark.sql.execution.datasources.DynamicPartitionDataWriter.write(FileFormatDataWriter.scala:260)
> at
> org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:242)
> at
> org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:239)
> at
> org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
> at
> org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:245)
> ... 10 more
> 20:43:55.460 WARN
> org.apache.spark.sql.execution.datasources.FileFormatWriterSuite:
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]