This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 965f872500a [SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the
breaking change of write null value in csv to unquoted empty string
965f872500a is described below
commit 965f872500a3554142cab3078a7a4d513d2d2ee8
Author: Xinyi Yu <[email protected]>
AuthorDate: Fri Apr 15 16:45:47 2022 +0800
[SPARK-37575][SQL][FOLLOWUP] Add legacy flag for the breaking change of
write null value in csv to unquoted empty string
### What changes were proposed in this pull request?
Add a legacy flag `spark.sql.legacy.nullValueWrittenAsQuotedEmptyStringCsv`
for the breaking change introduced in
https://github.com/apache/spark/pull/34853 and
https://github.com/apache/spark/pull/34905 (followup).
The flag is disabled by default, so null values are written to CSV as
unquoted empty strings. When the legacy flag is enabled, null values are
written as quoted empty strings.
### Why are the changes needed?
The original commit is a breaking change, and breaking changes should
come with a flag to turn them off for smooth migration between versions.
### Does this PR introduce _any_ user-facing change?
With the default value of the conf, there is no user-facing difference.
If users turn this conf on, they can restore the pre-change behavior.
### How was this patch tested?
Through unit tests.
Closes #36110 from anchovYu/flags-null-to-csv.
Authored-by: Xinyi Yu <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/csv/UnivocityGenerator.scala | 4 ++++
.../org/apache/spark/sql/internal/SQLConf.scala | 10 ++++++++++
.../sql/execution/datasources/csv/CSVSuite.scala | 20 ++++++++++++++------
3 files changed, 28 insertions(+), 6 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
index 5dd8c35e4c2..d124a055f63 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
@@ -24,6 +24,7 @@ import com.univocity.parsers.csv.CsvWriter
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils,
IntervalStringStyles, IntervalUtils, TimestampFormatter}
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
class UnivocityGenerator(
@@ -95,6 +96,9 @@ class UnivocityGenerator(
while (i < row.numFields) {
if (!row.isNullAt(i)) {
values(i) = valueConverters(i).apply(row, i)
+ } else if (
+
SQLConf.get.getConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV))
{
+ values(i) = options.nullValue
}
i += 1
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index ac2a2e350c6..36b666fd59c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3754,6 +3754,16 @@ object SQLConf {
.booleanConf
.createWithDefault(false)
+ val LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV =
+ buildConf("spark.sql.legacy.nullValueWrittenAsQuotedEmptyStringCsv")
+ .internal()
+ .doc("When set to false, nulls are written as unquoted empty strings in
CSV data source. " +
+ "If set to true, it restores the legacy behavior that nulls were
written as quoted " +
+ "empty strings, `\"\"`.")
+ .version("3.3.0")
+ .booleanConf
+ .createWithDefault(false)
+
/**
* Holds information about keys that have been deprecated.
*
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 819bb430173..9637a85ea35 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -807,12 +807,20 @@ abstract class CSVSuite
test("SPARK-37575: null values should be saved as nothing rather than " +
"quoted empty Strings \"\" with default settings") {
- withTempPath { path =>
- Seq(("Tesla", null: String, ""))
- .toDF("make", "comment", "blank")
- .write
- .csv(path.getCanonicalPath)
- checkAnswer(spark.read.text(path.getCanonicalPath), Row("Tesla,,\"\""))
+ Seq("true", "false").foreach { confVal =>
+
withSQLConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV.key ->
confVal) {
+ withTempPath { path =>
+ Seq(("Tesla", null: String, ""))
+ .toDF("make", "comment", "blank")
+ .write
+ .csv(path.getCanonicalPath)
+ if (confVal == "false") {
+ checkAnswer(spark.read.text(path.getCanonicalPath),
Row("Tesla,,\"\""))
+ } else {
+ checkAnswer(spark.read.text(path.getCanonicalPath),
Row("Tesla,\"\",\"\""))
+ }
+ }
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]