Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/20929#discussion_r190447608
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
---
@@ -2408,4 +2408,24 @@ class JsonSuite extends QueryTest with
SharedSQLContext with TestJsonData {
spark.read.option("mode", "PERMISSIVE").option("encoding",
"UTF-8").json(Seq(badJson).toDS()),
Row(badJson))
}
+
+ test("SPARK-23772 ignore column of all null values or empty array during
schema inference") {
+ withTempPath { tempDir =>
+ val path = tempDir.getAbsolutePath
+ Seq(
+ """{"a":null, "b":[null, null], "c":null, "d":[[], [null]],
"e":{}}""",
+ """{"a":null, "b":[null], "c":[], "d": [null, []], "e":{}}""",
+ """{"a":null, "b":[], "c":[], "d": null, "e":null}""")
+ .toDS().write.mode("overwrite").text(path)
+ val df = spark.read.format("json")
+ .option("dropFieldIfAllNull", true)
+ .load(path)
+ val expectedSchema = new StructType()
+ .add("a", NullType).add("b", NullType).add("c", NullType).add("d",
NullType)
+ .add("e", NullType)
+ assert(df.schema === expectedSchema)
--- End diff --
No, there's no explicit preference between them, since opinions are divided
even among committers. It's fine to use either one.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]