This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 2814293b289 [SPARK-45962][SQL] Remove `treatEmptyValuesAsNulls` and use `nullValue` option instead in XML 2814293b289 is described below commit 2814293b28967ba5f6fe819bee55a70c065f6c66 Author: Shujing Yang <shujing.y...@databricks.com> AuthorDate: Thu Nov 16 23:27:17 2023 -0800 [SPARK-45962][SQL] Remove `treatEmptyValuesAsNulls` and use `nullValue` option instead in XML ### What changes were proposed in this pull request? Remove treatEmptyValuesAsNulls and use nullValue option instead in XML ### Why are the changes needed? Today, we offer two options to handle null values. To enhance user clarity and simplify usage, we propose consolidating these into a single option. We recommend retaining the nullValue option due to its broader semantic scope. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Unit tests ### Was this patch authored or co-authored using generative AI tooling? No Closes #43852 from shujingyang-db/treatEmptyValue. 
Authored-by: Shujing Yang <shujing.y...@databricks.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala | 13 ++++++------- .../apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala | 2 +- .../org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala | 2 +- .../org/apache/spark/sql/catalyst/xml/XmlOptions.scala | 2 -- .../spark/sql/execution/datasources/xml/XmlSuite.scala | 12 ++++++------ 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala index b39b2e63526..dcf02bac1de 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala @@ -183,8 +183,8 @@ class StaxXmlParser( (parser.peek, dataType) match { case (_: StartElement, dt: DataType) => convertComplicatedType(dt, attributes) case (_: EndElement, _: StringType) => - // Empty. It's null if these are explicitly treated as null, or "" is the null value - if (options.treatEmptyValuesAsNulls || options.nullValue == "") { + // Empty. 
It's null if "" is the null value + if (options.nullValue == "") { null } else { UTF8String.fromString("") @@ -224,7 +224,8 @@ class StaxXmlParser( parser.peek match { case _: StartElement => convertComplicatedType(dataType, attributes) case _: EndElement if data.isEmpty => null - case _: EndElement if options.treatEmptyValuesAsNulls => null + // treat empty values as null + case _: EndElement if options.nullValue == "" => null case _: EndElement => convertTo(data, dataType) case _ => convertField(parser, dataType, attributes) } @@ -444,8 +445,7 @@ class StaxXmlParser( private def castTo( datum: String, castType: DataType): Any = { - if ((datum == options.nullValue) || - (options.treatEmptyValuesAsNulls && datum == "")) { + if (datum == options.nullValue || datum == null) { null } else { castType match { @@ -493,8 +493,7 @@ class StaxXmlParser( } else { datum } - if ((value == options.nullValue) || - (options.treatEmptyValuesAsNulls && value == "")) { + if (value == options.nullValue || value == null) { null } else { dataType match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala index d3b90564a75..654b78906bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala @@ -96,7 +96,7 @@ object StaxXmlParserUtils { attributes.map { attr => val key = options.attributePrefix + getName(attr.getName, options) val value = attr.getValue match { - case v if options.treatEmptyValuesAsNulls && v.trim.isEmpty => null + case v if (options.nullValue == "") && v.trim.isEmpty => null case v => v } key -> value diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala index 
53439879772..14470aa5fac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala @@ -161,7 +161,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean) parser.peek match { case _: StartElement => inferObject(parser) case _: EndElement if data.isEmpty => NullType - case _: EndElement if options.treatEmptyValuesAsNulls => NullType + case _: EndElement if options.nullValue == "" => NullType case _: EndElement => StringType case _ => inferField(parser) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala index 742506d6fdc..a76d30460cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala @@ -86,7 +86,6 @@ class XmlOptions( val samplingRatio = parameters.get(SAMPLING_RATIO).map(_.toDouble).getOrElse(1.0) require(samplingRatio > 0, s"$SAMPLING_RATIO ($samplingRatio) should be greater than 0") val excludeAttributeFlag = getBool(EXCLUDE_ATTRIBUTE, false) - val treatEmptyValuesAsNulls = getBool(TREAT_EMPTY_VALUE_AS_NULLS, false) val attributePrefix = parameters.getOrElse(ATTRIBUTE_PREFIX, XmlOptions.DEFAULT_ATTRIBUTE_PREFIX) val valueTag = parameters.getOrElse(VALUE_TAG, XmlOptions.DEFAULT_VALUE_TAG) @@ -188,7 +187,6 @@ object XmlOptions extends DataSourceOptions { val DECLARATION = newOption("declaration") val ARRAY_ELEMENT_NAME = newOption("arrayElementName") val EXCLUDE_ATTRIBUTE = newOption("excludeAttribute") - val TREAT_EMPTY_VALUE_AS_NULLS = newOption("treatEmptyValuesAsNulls") val ATTRIBUTE_PREFIX = newOption("attributePrefix") val VALUE_TAG = newOption("valueTag") val NULL_VALUE = newOption("nullValue") diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala index 5a901dadff9..3fa53a0a1c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala @@ -77,7 +77,7 @@ class XmlSuite extends QueryTest with SharedSparkSession { test("DSL test with xml having unbalanced datatypes") { val results = spark.read .option("rowTag", "ROW") - .option("treatEmptyValuesAsNulls", "true") + .option("nullValue", "") .option("multiLine", "true") .xml(getTestResourcePath(resDir + "gps-empty-field.xml")) @@ -440,7 +440,7 @@ class XmlSuite extends QueryTest with SharedSparkSession { assert(getLines(xmlFile).count(_.contains("<foo>")) === 2) } - test("DSL save with nullValue and treatEmptyValuesAsNulls") { + test("DSL save with nullValue") { val copyFilePath = getEmptyTempDir().resolve("books-copy.xml") val books = spark.read @@ -452,7 +452,7 @@ class XmlSuite extends QueryTest with SharedSparkSession { val booksCopy = spark.read .option("rowTag", "book") - .option("treatEmptyValuesAsNulls", "true") + .option("nullValue", "") .xml(copyFilePath.toString) assert(booksCopy.count() === books.count()) @@ -741,7 +741,7 @@ class XmlSuite extends QueryTest with SharedSparkSession { field("age", IntegerType)) val results = spark.read.schema(schema) .option("rowTag", "ROW") - .option("treatEmptyValuesAsNulls", true) + .option("nullValue", "") .xml(getTestResourcePath(resDir + "null-numbers.xml")) .select("name", "age") .collect() @@ -950,10 +950,10 @@ class XmlSuite extends QueryTest with SharedSparkSession { "requirement failed: 'valueTag' and 'attributePrefix' options should not be the same.") } - test("nullValue and treatEmptyValuesAsNulls test") { + test("nullValue test") { val resultsOne = spark.read .option("rowTag", "ROW") - 
.option("treatEmptyValuesAsNulls", "true") + .option("nullValue", "") .xml(getTestResourcePath(resDir + "gps-empty-field.xml")) assert(resultsOne.selectExpr("extensions.TrackPointExtension").head().getStruct(0) !== null) assert(resultsOne.selectExpr("extensions.TrackPointExtension") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org