Repository: spark Updated Branches: refs/heads/master 846bc61cf -> 95713eb4f
[SPARK-21804][SQL] json_tuple returns null values within repeated columns except the first one ## What changes were proposed in this pull request? When json_tuple extracts values from JSON, it returns null for every repeated column except the first one, as shown below: ``` scala scala> spark.sql("""SELECT json_tuple('{"a":1, "b":2}', 'a', 'b', 'a')""").show() +---+---+----+ | c0| c1| c2| +---+---+----+ | 1| 2|null| +---+---+----+ ``` I think this should be consistent with Hive's implementation: ``` hive> SELECT json_tuple('{"a": 1, "b": 2}', 'a', 'a'); ... 1 1 ``` In this PR, we locate all the matched indices in `fieldNames` instead of using only the first matched index (i.e., the result of a single `indexOf` call). ## How was this patch tested? Added a test in JsonExpressionsSuite. Author: Jen-Ming Chung <jenmingi...@gmail.com> Closes #19017 from jmchung/SPARK-21804. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95713eb4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95713eb4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95713eb4 Branch: refs/heads/master Commit: 95713eb4f22de4e16617a605f74a1d6373ed270b Parents: 846bc61 Author: Jen-Ming Chung <jenmingi...@gmail.com> Authored: Thu Aug 24 19:24:00 2017 +0900 Committer: hyukjinkwon <gurwls...@gmail.com> Committed: Thu Aug 24 19:24:00 2017 +0900 ---------------------------------------------------------------------- .../sql/catalyst/expressions/jsonExpressions.scala | 12 ++++++++++-- .../sql/catalyst/expressions/JsonExpressionsSuite.scala | 10 ++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/95713eb4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala ---------------------------------------------------------------------- diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index c375737..ee5da1a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -436,7 +436,8 @@ case class JsonTuple(children: Seq[Expression]) while (parser.nextToken() != JsonToken.END_OBJECT) { if (parser.getCurrentToken == JsonToken.FIELD_NAME) { // check to see if this field is desired in the output - val idx = fieldNames.indexOf(parser.getCurrentName) + val jsonField = parser.getCurrentName + var idx = fieldNames.indexOf(jsonField) if (idx >= 0) { // it is, copy the child tree to the correct location in the output row val output = new ByteArrayOutputStream() @@ -447,7 +448,14 @@ case class JsonTuple(children: Seq[Expression]) generator => copyCurrentStructure(generator, parser) } - row(idx) = UTF8String.fromBytes(output.toByteArray) + val jsonValue = UTF8String.fromBytes(output.toByteArray) + + // SPARK-21804: json_tuple returns null values within repeated columns + // except the first one; so that we need to check the remaining fields. 
+ do { + row(idx) = jsonValue + idx = fieldNames.indexOf(jsonField, idx + 1) + } while (idx >= 0) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/95713eb4/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index 1cd2b4f..9991bda 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -373,6 +373,16 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("2"))) } + test("SPARK-21804: json_tuple returns null values within repeated columns except the first one") { + checkJsonTuple( + JsonTuple(Literal("""{"f1": 1, "f2": 2}""") :: + NonFoldableLiteral("f1") :: + NonFoldableLiteral("cast(NULL AS STRING)") :: + NonFoldableLiteral("f1") :: + Nil), + InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("1"))) + } + val gmtId = Option(DateTimeUtils.TimeZoneGMT.getID) test("from_json") { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org