This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 71022d7 [SPARK-31008][SQL] Support json_array_length function
71022d7 is described below
commit 71022d71307a7d7c8491a346843bb714f4530c71
Author: iRakson <[email protected]>
AuthorDate: Tue Apr 7 15:34:33 2020 -0700
[SPARK-31008][SQL] Support json_array_length function
### What changes were proposed in this pull request?
At the moment we do not have any function to compute length of JSON array
directly.
I propose a `json_array_length` function which will return the length of
the outermost JSON array.
- This function will return length of the outermost JSON array, if JSON
array is valid.
```
scala> spark.sql("select
json_array_length('[1,2,3,[33,44],{\"key\":[2,3,4]}]')").show
+--------------------------------------------------+
|json_array_length([1,2,3,[33,44],{"key":[2,3,4]}])|
+--------------------------------------------------+
| 5|
+--------------------------------------------------+
scala> spark.sql("select json_array_length('[[1],[2,3]]')").show
+------------------------------+
|json_array_length([[1],[2,3]])|
+------------------------------+
| 2|
+------------------------------+
```
- In case of any other valid JSON string, invalid JSON string or null array
or `NULL` input , `NULL` will be returned.
```
scala> spark.sql("select json_array_length('')").show
+-------------------+
|json_array_length()|
+-------------------+
| null|
+-------------------+
```
### Why are the changes needed?
- As mentioned in JIRA, this function is supported by presto, postgreSQL,
redshift, SQLite, MySQL, MariaDB, IBM DB2.
- for better user experience and ease of use.
```
Performance Result for Json array - [1, 2, 3, 4]
Intel(R) Core(TM) i7-9750H CPU 2.60GHz
JSON functions: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
json_array_length 7728 7762
53 1.3 772.8 1.0X
size+from_json 12739 12895
199 0.8 1273.9 0.6X
```
### Does this PR introduce any user-facing change?
Yes, now users can get length of a json array by using `json_array_length`.
### How was this patch tested?
Added UT.
Closes #27759 from iRakson/jsonArrayLength.
Authored-by: iRakson <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../sql/catalyst/analysis/FunctionRegistry.scala | 1 +
.../sql/catalyst/expressions/jsonExpressions.scala | 68 ++++++++++++++++
.../expressions/JsonExpressionsSuite.scala | 18 +++++
.../resources/sql-tests/inputs/json-functions.sql | 14 ++++
.../sql-tests/results/json-functions.sql.out | 92 +++++++++++++++++++++-
5 files changed, 192 insertions(+), 1 deletion(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index c11186e..fb51fa7 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -538,6 +538,7 @@ object FunctionRegistry {
expression[StructsToJson]("to_json"),
expression[JsonToStructs]("from_json"),
expression[SchemaOfJson]("schema_of_json"),
+ expression[LengthOfJsonArray]("json_array_length"),
// cast
expression[Cast]("cast"),
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index 4c2a511..fbb11de 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -796,3 +796,71 @@ case class SchemaOfJson(
override def prettyName: String = "schema_of_json"
}
+
+/**
+ * A function that returns the number of elements in the outmost JSON array.
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(jsonArray) - Returns the number of elements in the outmost
JSON array.",
+ arguments = """
+ Arguments:
+ * jsonArray - A JSON array. `NULL` is returned in case of any other
valid JSON string,
+ `NULL` or an invalid JSON.
+ """,
+ examples = """
+ Examples:
+ > SELECT _FUNC_('[1,2,3,4]');
+ 4
+ > SELECT _FUNC_('[1,2,3,{"f1":1,"f2":[5,6]},4]');
+ 5
+ > SELECT _FUNC_('[1,2');
+ NULL
+ """,
+ since = "3.1.0"
+)
+case class LengthOfJsonArray(child: Expression) extends UnaryExpression
+ with CodegenFallback with ExpectsInputTypes {
+
+ override def inputTypes: Seq[DataType] = Seq(StringType)
+ override def dataType: DataType = IntegerType
+ override def nullable: Boolean = true
+ override def prettyName: String = "json_array_length"
+
+ override def eval(input: InternalRow): Any = {
+ val json = child.eval(input).asInstanceOf[UTF8String]
+ // return null for null input
+ if (json == null) {
+ return null
+ }
+
+ try {
+
Utils.tryWithResource(CreateJacksonParser.utf8String(SharedFactory.jsonFactory,
json)) {
+ parser => {
+ // return null if null array is encountered.
+ if (parser.nextToken() == null) {
+ return null
+ }
+ // Parse the array to compute its length.
+ parseCounter(parser, input)
+ }
+ }
+ } catch {
+ case _: JsonProcessingException | _: IOException => null
+ }
+ }
+
+ private def parseCounter(parser: JsonParser, input: InternalRow): Any = {
+ var length = 0
+ // Only JSON array are supported for this function.
+ if (parser.currentToken != JsonToken.START_ARRAY) {
+ return null
+ }
+ // Keep traversing until the end of JSON array
+ while(parser.nextToken() != JsonToken.END_ARRAY) {
+ length += 1
+ // skip all the child of inner object or array
+ parser.skipChildren()
+ }
+ length
+ }
+}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
index 90c4d8f..2536d28 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
@@ -813,4 +813,22 @@ class JsonExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper with
checkDecimalInfer(_, """struct<d:decimal(7,3)>""")
}
}
+
+ test("Length of JSON array") {
+ Seq(
+ ("", null),
+ ("[1,2,3]", 3),
+ ("[]", 0),
+ ("[[1],[2,3],[]]", 3),
+ ("""[{"a":123},{"b":"hello"}]""", 2),
+ ("""[1,2,3,[33,44],{"key":[2,3,4]}]""", 5),
+ ("""[1,2,3,4,5""", null),
+ ("Random String", null),
+ ("""{"key":"not a json array"}""", null),
+ ("""{"key": 25}""", null)
+ ).foreach {
+ case(literal, expectedValue) =>
+ checkEvaluation(LengthOfJsonArray(Literal(literal)), expectedValue)
+ }
+ }
}
diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
index 6c14eee..06de798 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
@@ -58,5 +58,19 @@ select schema_of_json('{"c1":01, "c2":0.1}',
map('allowNumericLeadingZeros', 'tr
select schema_of_json(null);
CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a":
1, "b": 2}', 'a');
SELECT schema_of_json(jsonField) FROM jsonTable;
+
+-- json_array_length
+select json_array_length(null);
+select json_array_length(2);
+select json_array_length();
+select json_array_length('');
+select json_array_length('[]');
+select json_array_length('[1,2,3]');
+select json_array_length('[[1,2],[5,6,7]]');
+select json_array_length('[{"a":123},{"b":"hello"}]');
+select json_array_length('[1,2,3,[33,44],{"key":[2,3,4]}]');
+select json_array_length('{"key":"not a json array"}');
+select json_array_length('[1,2,3,4,5');
+
-- Clean up
DROP VIEW IF EXISTS jsonTable;
diff --git
a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
index 920b45a..135b18c 100644
--- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 42
+-- Number of queries: 53
-- !query
@@ -347,6 +347,96 @@ cannot resolve 'schema_of_json(jsontable.`jsonField`)' due
to data type mismatch
-- !query
+select json_array_length(null)
+-- !query schema
+struct<json_array_length(CAST(NULL AS STRING)):int>
+-- !query output
+NULL
+
+
+-- !query
+select json_array_length(2)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+cannot resolve 'json_array_length(2)' due to data type mismatch: argument 1
requires string type, however, '2' is of int type.; line 1 pos 7
+
+
+-- !query
+select json_array_length()
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Invalid number of arguments for function json_array_length. Expected: 1;
Found: 0; line 1 pos 7
+
+
+-- !query
+select json_array_length('')
+-- !query schema
+struct<json_array_length():int>
+-- !query output
+NULL
+
+
+-- !query
+select json_array_length('[]')
+-- !query schema
+struct<json_array_length([]):int>
+-- !query output
+0
+
+
+-- !query
+select json_array_length('[1,2,3]')
+-- !query schema
+struct<json_array_length([1,2,3]):int>
+-- !query output
+3
+
+
+-- !query
+select json_array_length('[[1,2],[5,6,7]]')
+-- !query schema
+struct<json_array_length([[1,2],[5,6,7]]):int>
+-- !query output
+2
+
+
+-- !query
+select json_array_length('[{"a":123},{"b":"hello"}]')
+-- !query schema
+struct<json_array_length([{"a":123},{"b":"hello"}]):int>
+-- !query output
+2
+
+
+-- !query
+select json_array_length('[1,2,3,[33,44],{"key":[2,3,4]}]')
+-- !query schema
+struct<json_array_length([1,2,3,[33,44],{"key":[2,3,4]}]):int>
+-- !query output
+5
+
+
+-- !query
+select json_array_length('{"key":"not a json array"}')
+-- !query schema
+struct<json_array_length({"key":"not a json array"}):int>
+-- !query output
+NULL
+
+
+-- !query
+select json_array_length('[1,2,3,4,5')
+-- !query schema
+struct<json_array_length([1,2,3,4,5):int>
+-- !query output
+NULL
+
+
+-- !query
DROP VIEW IF EXISTS jsonTable
-- !query schema
struct<>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]