This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2814293b289 [SPARK-45962][SQL] Remove `treatEmptyValuesAsNulls` and
use `nullValue` option instead in XML
2814293b289 is described below
commit 2814293b28967ba5f6fe819bee55a70c065f6c66
Author: Shujing Yang <[email protected]>
AuthorDate: Thu Nov 16 23:27:17 2023 -0800
[SPARK-45962][SQL] Remove `treatEmptyValuesAsNulls` and use `nullValue`
option instead in XML
### What changes were proposed in this pull request?
Remove treatEmptyValuesAsNulls and use nullValue option instead in XML
### Why are the changes needed?
Today, we offer two options for handling null values. To enhance
user clarity and simplify usage, we propose consolidating these into a single
option. We recommend retaining the nullValue option due to its broader semantic
scope.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Unit tests
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #43852 from shujingyang-db/treatEmptyValue.
Authored-by: Shujing Yang <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala | 13 ++++++-------
.../apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala | 2 +-
.../org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala | 2 +-
.../org/apache/spark/sql/catalyst/xml/XmlOptions.scala | 2 --
.../spark/sql/execution/datasources/xml/XmlSuite.scala | 12 ++++++------
5 files changed, 14 insertions(+), 17 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
index b39b2e63526..dcf02bac1de 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -183,8 +183,8 @@ class StaxXmlParser(
(parser.peek, dataType) match {
case (_: StartElement, dt: DataType) => convertComplicatedType(dt,
attributes)
case (_: EndElement, _: StringType) =>
- // Empty. It's null if these are explicitly treated as null, or "" is
the null value
- if (options.treatEmptyValuesAsNulls || options.nullValue == "") {
+ // Empty. It's null if "" is the null value
+ if (options.nullValue == "") {
null
} else {
UTF8String.fromString("")
@@ -224,7 +224,8 @@ class StaxXmlParser(
parser.peek match {
case _: StartElement => convertComplicatedType(dataType, attributes)
case _: EndElement if data.isEmpty => null
- case _: EndElement if options.treatEmptyValuesAsNulls => null
+ // treat empty values as null
+ case _: EndElement if options.nullValue == "" => null
case _: EndElement => convertTo(data, dataType)
case _ => convertField(parser, dataType, attributes)
}
@@ -444,8 +445,7 @@ class StaxXmlParser(
private def castTo(
datum: String,
castType: DataType): Any = {
- if ((datum == options.nullValue) ||
- (options.treatEmptyValuesAsNulls && datum == "")) {
+ if (datum == options.nullValue || datum == null) {
null
} else {
castType match {
@@ -493,8 +493,7 @@ class StaxXmlParser(
} else {
datum
}
- if ((value == options.nullValue) ||
- (options.treatEmptyValuesAsNulls && value == "")) {
+ if (value == options.nullValue || value == null) {
null
} else {
dataType match {
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala
index d3b90564a75..654b78906bd 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala
@@ -96,7 +96,7 @@ object StaxXmlParserUtils {
attributes.map { attr =>
val key = options.attributePrefix + getName(attr.getName, options)
val value = attr.getValue match {
- case v if options.treatEmptyValuesAsNulls && v.trim.isEmpty => null
+ case v if (options.nullValue == "") && v.trim.isEmpty => null
case v => v
}
key -> value
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
index 53439879772..14470aa5fac 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
@@ -161,7 +161,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive:
Boolean)
parser.peek match {
case _: StartElement => inferObject(parser)
case _: EndElement if data.isEmpty => NullType
- case _: EndElement if options.treatEmptyValuesAsNulls => NullType
+ case _: EndElement if options.nullValue == "" => NullType
case _: EndElement => StringType
case _ => inferField(parser)
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala
index 742506d6fdc..a76d30460cc 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala
@@ -86,7 +86,6 @@ class XmlOptions(
val samplingRatio =
parameters.get(SAMPLING_RATIO).map(_.toDouble).getOrElse(1.0)
require(samplingRatio > 0, s"$SAMPLING_RATIO ($samplingRatio) should be
greater than 0")
val excludeAttributeFlag = getBool(EXCLUDE_ATTRIBUTE, false)
- val treatEmptyValuesAsNulls = getBool(TREAT_EMPTY_VALUE_AS_NULLS, false)
val attributePrefix =
parameters.getOrElse(ATTRIBUTE_PREFIX, XmlOptions.DEFAULT_ATTRIBUTE_PREFIX)
val valueTag = parameters.getOrElse(VALUE_TAG, XmlOptions.DEFAULT_VALUE_TAG)
@@ -188,7 +187,6 @@ object XmlOptions extends DataSourceOptions {
val DECLARATION = newOption("declaration")
val ARRAY_ELEMENT_NAME = newOption("arrayElementName")
val EXCLUDE_ATTRIBUTE = newOption("excludeAttribute")
- val TREAT_EMPTY_VALUE_AS_NULLS = newOption("treatEmptyValuesAsNulls")
val ATTRIBUTE_PREFIX = newOption("attributePrefix")
val VALUE_TAG = newOption("valueTag")
val NULL_VALUE = newOption("nullValue")
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
index 5a901dadff9..3fa53a0a1c1 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
@@ -77,7 +77,7 @@ class XmlSuite extends QueryTest with SharedSparkSession {
test("DSL test with xml having unbalanced datatypes") {
val results = spark.read
.option("rowTag", "ROW")
- .option("treatEmptyValuesAsNulls", "true")
+ .option("nullValue", "")
.option("multiLine", "true")
.xml(getTestResourcePath(resDir + "gps-empty-field.xml"))
@@ -440,7 +440,7 @@ class XmlSuite extends QueryTest with SharedSparkSession {
assert(getLines(xmlFile).count(_.contains("<foo>")) === 2)
}
- test("DSL save with nullValue and treatEmptyValuesAsNulls") {
+ test("DSL save with nullValue") {
val copyFilePath = getEmptyTempDir().resolve("books-copy.xml")
val books = spark.read
@@ -452,7 +452,7 @@ class XmlSuite extends QueryTest with SharedSparkSession {
val booksCopy = spark.read
.option("rowTag", "book")
- .option("treatEmptyValuesAsNulls", "true")
+ .option("nullValue", "")
.xml(copyFilePath.toString)
assert(booksCopy.count() === books.count())
@@ -741,7 +741,7 @@ class XmlSuite extends QueryTest with SharedSparkSession {
field("age", IntegerType))
val results = spark.read.schema(schema)
.option("rowTag", "ROW")
- .option("treatEmptyValuesAsNulls", true)
+ .option("nullValue", "")
.xml(getTestResourcePath(resDir + "null-numbers.xml"))
.select("name", "age")
.collect()
@@ -950,10 +950,10 @@ class XmlSuite extends QueryTest with SharedSparkSession {
"requirement failed: 'valueTag' and 'attributePrefix' options should not
be the same.")
}
- test("nullValue and treatEmptyValuesAsNulls test") {
+ test("nullValue test") {
val resultsOne = spark.read
.option("rowTag", "ROW")
- .option("treatEmptyValuesAsNulls", "true")
+ .option("nullValue", "")
.xml(getTestResourcePath(resDir + "gps-empty-field.xml"))
assert(resultsOne.selectExpr("extensions.TrackPointExtension").head().getStruct(0)
!== null)
assert(resultsOne.selectExpr("extensions.TrackPointExtension")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]