This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 71022d7  [SPARK-31008][SQL] Support json_array_length function
71022d7 is described below

commit 71022d71307a7d7c8491a346843bb714f4530c71
Author: iRakson <[email protected]>
AuthorDate: Tue Apr 7 15:34:33 2020 -0700

    [SPARK-31008][SQL] Support json_array_length function
    
    ### What changes were proposed in this pull request?
    At the moment we do not have any function to compute length of JSON array 
directly.
    I propose a `json_array_length` function that will return the length of 
the outermost JSON array.
    
    - This function will return length of the outermost JSON array, if JSON 
array is valid.
    ```
    
    scala> spark.sql("select 
json_array_length('[1,2,3,[33,44],{\"key\":[2,3,4]}]')").show
    +--------------------------------------------------+
    |json_array_length([1,2,3,[33,44],{"key":[2,3,4]}])|
    +--------------------------------------------------+
    |                                                 5|
    +--------------------------------------------------+
    
    scala> spark.sql("select json_array_length('[[1],[2,3]]')").show
    +------------------------------+
    |json_array_length([[1],[2,3]])|
    +------------------------------+
    |                             2|
    +------------------------------+
    
    ```
    - In case of any other valid JSON string, invalid JSON string, null array, 
or `NULL` input, `NULL` will be returned.
    ```
    scala> spark.sql("select json_array_length('')").show
    +-------------------+
    |json_array_length()|
    +-------------------+
    |               null|
    +-------------------+
    ```
    
    ### Why are the changes needed?
    
    - As mentioned in JIRA, this function is supported by Presto, PostgreSQL, 
Redshift, SQLite, MySQL, MariaDB, and IBM DB2.
    
    - For better user experience and ease of use.
    
    ```
    Performance Result for Json array - [1, 2, 3, 4]
    
    Intel(R) Core(TM) i7-9750H CPU  2.60GHz
    JSON functions:                           Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    
------------------------------------------------------------------------------------------------------------------------
    json_array_length                                  7728           7762      
    53          1.3         772.8       1.0X
    size+from_json                                    12739          12895      
   199          0.8        1273.9       0.6X
    
    ```
    
    ### Does this PR introduce any user-facing change?
    Yes, now users can get length of a json array by using `json_array_length`.
    
    ### How was this patch tested?
    Added UT.
    
    Closes #27759 from iRakson/jsonArrayLength.
    
    Authored-by: iRakson <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../sql/catalyst/analysis/FunctionRegistry.scala   |  1 +
 .../sql/catalyst/expressions/jsonExpressions.scala | 68 ++++++++++++++++
 .../expressions/JsonExpressionsSuite.scala         | 18 +++++
 .../resources/sql-tests/inputs/json-functions.sql  | 14 ++++
 .../sql-tests/results/json-functions.sql.out       | 92 +++++++++++++++++++++-
 5 files changed, 192 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index c11186e..fb51fa7 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -538,6 +538,7 @@ object FunctionRegistry {
     expression[StructsToJson]("to_json"),
     expression[JsonToStructs]("from_json"),
     expression[SchemaOfJson]("schema_of_json"),
+    expression[LengthOfJsonArray]("json_array_length"),
 
     // cast
     expression[Cast]("cast"),
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index 4c2a511..fbb11de 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -796,3 +796,71 @@ case class SchemaOfJson(
 
   override def prettyName: String = "schema_of_json"
 }
+
+/**
+ * A function that returns the number of elements in the outmost JSON array.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(jsonArray) - Returns the number of elements in the outmost 
JSON array.",
+  arguments = """
+    Arguments:
+      * jsonArray - A JSON array. `NULL` is returned in case of any other 
valid JSON string,
+          `NULL` or an invalid JSON.
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('[1,2,3,4]');
+        4
+      > SELECT _FUNC_('[1,2,3,{"f1":1,"f2":[5,6]},4]');
+        5
+      > SELECT _FUNC_('[1,2');
+        NULL
+  """,
+  since = "3.1.0"
+)
+case class LengthOfJsonArray(child: Expression) extends UnaryExpression
+  with CodegenFallback with ExpectsInputTypes {
+
+  override def inputTypes: Seq[DataType] = Seq(StringType)
+  override def dataType: DataType = IntegerType
+  override def nullable: Boolean = true
+  override def prettyName: String = "json_array_length"
+
+  override def eval(input: InternalRow): Any = {
+    val json = child.eval(input).asInstanceOf[UTF8String]
+    // return null for null input
+    if (json == null) {
+      return null
+    }
+
+    try {
+      
Utils.tryWithResource(CreateJacksonParser.utf8String(SharedFactory.jsonFactory, 
json)) {
+        parser => {
+          // return null if null array is encountered.
+          if (parser.nextToken() == null) {
+            return null
+          }
+          // Parse the array to compute its length.
+          parseCounter(parser, input)
+        }
+      }
+    } catch {
+      case _: JsonProcessingException | _: IOException => null
+    }
+  }
+
+  private def parseCounter(parser: JsonParser, input: InternalRow): Any = {
+    var length = 0
+    // Only JSON array are supported for this function.
+    if (parser.currentToken != JsonToken.START_ARRAY) {
+      return null
+    }
+    // Keep traversing until the end of JSON array
+    while(parser.nextToken() != JsonToken.END_ARRAY) {
+      length += 1
+      // skip all the child of inner object or array
+      parser.skipChildren()
+    }
+    length
+  }
+}
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
index 90c4d8f..2536d28 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
@@ -813,4 +813,22 @@ class JsonExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper with
         checkDecimalInfer(_, """struct<d:decimal(7,3)>""")
     }
   }
+
+  test("Length of JSON array") {
+    Seq(
+      ("", null),
+      ("[1,2,3]", 3),
+      ("[]", 0),
+      ("[[1],[2,3],[]]", 3),
+      ("""[{"a":123},{"b":"hello"}]""", 2),
+      ("""[1,2,3,[33,44],{"key":[2,3,4]}]""", 5),
+      ("""[1,2,3,4,5""", null),
+      ("Random String", null),
+      ("""{"key":"not a json array"}""", null),
+      ("""{"key": 25}""", null)
+    ).foreach {
+      case(literal, expectedValue) =>
+        checkEvaluation(LengthOfJsonArray(Literal(literal)), expectedValue)
+    }
+  }
 }
diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
index 6c14eee..06de798 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
@@ -58,5 +58,19 @@ select schema_of_json('{"c1":01, "c2":0.1}', 
map('allowNumericLeadingZeros', 'tr
 select schema_of_json(null);
 CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 
1, "b": 2}', 'a');
 SELECT schema_of_json(jsonField) FROM jsonTable;
+
+-- json_array_length
+select json_array_length(null);
+select json_array_length(2);
+select json_array_length();
+select json_array_length('');
+select json_array_length('[]');
+select json_array_length('[1,2,3]');
+select json_array_length('[[1,2],[5,6,7]]');
+select json_array_length('[{"a":123},{"b":"hello"}]');
+select json_array_length('[1,2,3,[33,44],{"key":[2,3,4]}]');
+select json_array_length('{"key":"not a json array"}');
+select json_array_length('[1,2,3,4,5');
+
 -- Clean up
 DROP VIEW IF EXISTS jsonTable;
diff --git 
a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
index 920b45a..135b18c 100644
--- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 42
+-- Number of queries: 53
 
 
 -- !query
@@ -347,6 +347,96 @@ cannot resolve 'schema_of_json(jsontable.`jsonField`)' due 
to data type mismatch
 
 
 -- !query
+select json_array_length(null)
+-- !query schema
+struct<json_array_length(CAST(NULL AS STRING)):int>
+-- !query output
+NULL
+
+
+-- !query
+select json_array_length(2)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+cannot resolve 'json_array_length(2)' due to data type mismatch: argument 1 
requires string type, however, '2' is of int type.; line 1 pos 7
+
+
+-- !query
+select json_array_length()
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Invalid number of arguments for function json_array_length. Expected: 1; 
Found: 0; line 1 pos 7
+
+
+-- !query
+select json_array_length('')
+-- !query schema
+struct<json_array_length():int>
+-- !query output
+NULL
+
+
+-- !query
+select json_array_length('[]')
+-- !query schema
+struct<json_array_length([]):int>
+-- !query output
+0
+
+
+-- !query
+select json_array_length('[1,2,3]')
+-- !query schema
+struct<json_array_length([1,2,3]):int>
+-- !query output
+3
+
+
+-- !query
+select json_array_length('[[1,2],[5,6,7]]')
+-- !query schema
+struct<json_array_length([[1,2],[5,6,7]]):int>
+-- !query output
+2
+
+
+-- !query
+select json_array_length('[{"a":123},{"b":"hello"}]')
+-- !query schema
+struct<json_array_length([{"a":123},{"b":"hello"}]):int>
+-- !query output
+2
+
+
+-- !query
+select json_array_length('[1,2,3,[33,44],{"key":[2,3,4]}]')
+-- !query schema
+struct<json_array_length([1,2,3,[33,44],{"key":[2,3,4]}]):int>
+-- !query output
+5
+
+
+-- !query
+select json_array_length('{"key":"not a json array"}')
+-- !query schema
+struct<json_array_length({"key":"not a json array"}):int>
+-- !query output
+NULL
+
+
+-- !query
+select json_array_length('[1,2,3,4,5')
+-- !query schema
+struct<json_array_length([1,2,3,4,5):int>
+-- !query output
+NULL
+
+
+-- !query
 DROP VIEW IF EXISTS jsonTable
 -- !query schema
 struct<>


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to