spark git commit: [SPARK-16548][SQL] Inconsistent error handling in JSON parsing SQL functions

2017-04-25 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/master caf392025 -> 57e1da394


[SPARK-16548][SQL] Inconsistent error handling in JSON parsing SQL functions

## What changes were proposed in this pull request?

change to using Jackson's `com.fasterxml.jackson.core.JsonFactory`

public JsonParser createParser(String content)

## How was this patch tested?

existing unit tests

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Eric Wasserman 

Closes #17693 from ewasserman/SPARK-20314.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57e1da39
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57e1da39
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57e1da39

Branch: refs/heads/master
Commit: 57e1da39464131329318b723caa54df9f55fa54f
Parents: caf3920
Author: Eric Wasserman 
Authored: Wed Apr 26 11:42:43 2017 +0800
Committer: Wenchen Fan 
Committed: Wed Apr 26 11:42:43 2017 +0800

--
 .../sql/catalyst/expressions/jsonExpressions.scala | 12 +---
 .../expressions/JsonExpressionsSuite.scala | 17 +
 2 files changed, 26 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/57e1da39/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index df4d406..9fb0ea6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.io.{ByteArrayOutputStream, CharArrayWriter, StringWriter}
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, CharArrayWriter, 
InputStreamReader, StringWriter}
 
 import scala.util.parsing.combinator.RegexParsers
 
@@ -149,7 +149,10 @@ case class GetJsonObject(json: Expression, path: 
Expression)
 
 if (parsed.isDefined) {
   try {
-Utils.tryWithResource(jsonFactory.createParser(jsonStr.getBytes)) { 
parser =>
+/* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having 
Jackson
+  detect character encoding which could fail for some malformed 
strings */
+Utils.tryWithResource(jsonFactory.createParser(new InputStreamReader(
+new ByteArrayInputStream(jsonStr.getBytes), "UTF-8"))) { parser =>
   val output = new ByteArrayOutputStream()
   val matched = Utils.tryWithResource(
 jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { 
generator =>
@@ -393,7 +396,10 @@ case class JsonTuple(children: Seq[Expression])
 }
 
 try {
-  Utils.tryWithResource(jsonFactory.createParser(json.getBytes)) {
+  /* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having 
Jackson
+  detect character encoding which could fail for some malformed strings */
+  Utils.tryWithResource(jsonFactory.createParser(new InputStreamReader(
+  new ByteArrayInputStream(json.getBytes), "UTF-8"))) {
 parser => parseRow(parser, input)
   }
 } catch {

http://git-wip-us.apache.org/repos/asf/spark/blob/57e1da39/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
index c5b7223..4402ad4 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
@@ -39,6 +39,10 @@ class JsonExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   |"fb:testid":"1234"}
   |""".stripMargin
 
+  /* invalid json with leading nulls would trigger 
java.io.CharConversionException
+   in Jackson's JsonFactory.createParser(byte[]) due to RFC-4627 encoding 
detection */
+  val badJson = "\0\0\0A\1AAA"
+
   test("$.store.bicycle") {
 checkEvaluation(
   GetJsonObject(Literal(json), Literal("$.store.bicycle")),
@@ -224,6 +228,13 @@ class JsonExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   null)
   }
 
+  test("SPARK-16548: character conversion") {
+checkEvaluation(
+  GetJsonObject(Literal(badJson), Literal("$.a")),
+   

spark git commit: [SPARK-16548][SQL] Inconsistent error handling in JSON parsing SQL functions

2017-04-25 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 f0de60079 -> c8803c068


[SPARK-16548][SQL] Inconsistent error handling in JSON parsing SQL functions

## What changes were proposed in this pull request?

change to using Jackson's `com.fasterxml.jackson.core.JsonFactory`

public JsonParser createParser(String content)

## How was this patch tested?

existing unit tests

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Eric Wasserman 

Closes #17693 from ewasserman/SPARK-20314.

(cherry picked from commit 57e1da39464131329318b723caa54df9f55fa54f)
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c8803c06
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c8803c06
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c8803c06

Branch: refs/heads/branch-2.2
Commit: c8803c06854683c8761fdb3c0e4c55d5a9e22a95
Parents: f0de600
Author: Eric Wasserman 
Authored: Wed Apr 26 11:42:43 2017 +0800
Committer: Wenchen Fan 
Committed: Wed Apr 26 11:43:03 2017 +0800

--
 .../sql/catalyst/expressions/jsonExpressions.scala | 12 +---
 .../expressions/JsonExpressionsSuite.scala | 17 +
 2 files changed, 26 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c8803c06/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index df4d406..9fb0ea6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.io.{ByteArrayOutputStream, CharArrayWriter, StringWriter}
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, CharArrayWriter, 
InputStreamReader, StringWriter}
 
 import scala.util.parsing.combinator.RegexParsers
 
@@ -149,7 +149,10 @@ case class GetJsonObject(json: Expression, path: 
Expression)
 
 if (parsed.isDefined) {
   try {
-Utils.tryWithResource(jsonFactory.createParser(jsonStr.getBytes)) { 
parser =>
+/* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having 
Jackson
+  detect character encoding which could fail for some malformed 
strings */
+Utils.tryWithResource(jsonFactory.createParser(new InputStreamReader(
+new ByteArrayInputStream(jsonStr.getBytes), "UTF-8"))) { parser =>
   val output = new ByteArrayOutputStream()
   val matched = Utils.tryWithResource(
 jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { 
generator =>
@@ -393,7 +396,10 @@ case class JsonTuple(children: Seq[Expression])
 }
 
 try {
-  Utils.tryWithResource(jsonFactory.createParser(json.getBytes)) {
+  /* We know the bytes are UTF-8 encoded. Pass a Reader to avoid having 
Jackson
+  detect character encoding which could fail for some malformed strings */
+  Utils.tryWithResource(jsonFactory.createParser(new InputStreamReader(
+  new ByteArrayInputStream(json.getBytes), "UTF-8"))) {
 parser => parseRow(parser, input)
   }
 } catch {

http://git-wip-us.apache.org/repos/asf/spark/blob/c8803c06/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
index c5b7223..4402ad4 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
@@ -39,6 +39,10 @@ class JsonExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   |"fb:testid":"1234"}
   |""".stripMargin
 
+  /* invalid json with leading nulls would trigger 
java.io.CharConversionException
+   in Jackson's JsonFactory.createParser(byte[]) due to RFC-4627 encoding 
detection */
+  val badJson = "\0\0\0A\1AAA"
+
   test("$.store.bicycle") {
 checkEvaluation(
   GetJsonObject(Literal(json), Literal("$.store.bicycle")),
@@ -224,6 +228,13 @@ class JsonExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   null)
   }
 
+  test("SPARK-16548: c