Repository: spark
Updated Branches:
  refs/heads/master 8e039a755 -> c7eadb5e6


[SPARK-25660][SQL] Fix for the backward slash as CSV fields delimiter

## What changes were proposed in this pull request?

The PR addresses an exception raised when accessing characters beyond the end of the
delimiter string. In particular, using the backslash `\` as the CSV field delimiter
causes the following exception when reading `abc\1`:
```Scala
String index out of range: 1
java.lang.StringIndexOutOfBoundsException: String index out of range: 1
        at java.lang.String.charAt(String.java:658)
```
because `str.charAt(1)` in `CSVUtils.toChar` tries to access a character beyond the end of `str`.
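
For illustration, a minimal sketch of the failure mode with a plain string, independent of the Spark source: a delimiter consisting of the single backslash character has length 1, so indexing its second character throws.
```Scala
// Minimal sketch of the old failure mode, outside of Spark:
// the delimiter string holds only the backslash character, so it has length 1
// and charAt(1) is out of range.
val delimiter = "\\"            // a single backslash character
assert(delimiter.length == 1)
assert(delimiter.charAt(0) == '\\')
delimiter.charAt(1)             // throws StringIndexOutOfBoundsException, matching the stack trace above
```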

## How was this patch tested?

Added tests to `CSVUtilsSuite` for the empty string and for a string containing a
backslash. In addition, an end-to-end test was added to check how the backslash is
handled when reading a CSV string delimited by it.
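
As a usage sketch of the fixed behavior (assuming an active `SparkSession` named `spark`; the escaped delimiter value mirrors the end-to-end test added below):
```Scala
import spark.implicits._

// Read a CSV line that uses the backslash character as the field delimiter.
// After this fix, the delimiter is passed as two backslashes ("""\\"""),
// which CSVUtils.toChar resolves to the single '\' character.
val input = Seq("""abc\1""").toDS()
val df = spark.read
  .option("delimiter", """\\""")
  .csv(input)
// df contains a single row: Row("abc", "1")
```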

Closes #22654 from MaxGekk/csv-slash-delim.

Authored-by: Maxim Gekk <maxim.g...@databricks.com>
Signed-off-by: gatorsmile <gatorsm...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c7eadb5e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c7eadb5e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c7eadb5e

Branch: refs/heads/master
Commit: c7eadb5e6652468f9d5cd714c112ba1de187eea8
Parents: 8e039a7
Author: Maxim Gekk <maxim.g...@databricks.com>
Authored: Fri Oct 12 12:04:00 2018 -0700
Committer: gatorsmile <gatorsm...@gmail.com>
Committed: Fri Oct 12 12:04:00 2018 -0700

----------------------------------------------------------------------
 .../execution/datasources/csv/CSVUtils.scala    | 36 +++++++++++---------
 .../execution/datasources/csv/CSVSuite.scala    | 10 ++++++
 .../datasources/csv/CSVUtilsSuite.scala         | 14 ++++++++
 3 files changed, 43 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/c7eadb5e/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
index b912f8a..0a7473c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
@@ -139,23 +139,25 @@ object CSVUtils {
    */
   @throws[IllegalArgumentException]
   def toChar(str: String): Char = {
-    if (str.charAt(0) == '\\') {
-      str.charAt(1)
-      match {
-        case 't' => '\t'
-        case 'r' => '\r'
-        case 'b' => '\b'
-        case 'f' => '\f'
-        case '\"' => '\"' // In case user changes quote char and uses \" as 
delimiter in options
-        case '\'' => '\''
-        case 'u' if str == """\u0000""" => '\u0000'
-        case _ =>
-          throw new IllegalArgumentException(s"Unsupported special character 
for delimiter: $str")
-      }
-    } else if (str.length == 1) {
-      str.charAt(0)
-    } else {
-      throw new IllegalArgumentException(s"Delimiter cannot be more than one 
character: $str")
+    (str: Seq[Char]) match {
+      case Seq() => throw new IllegalArgumentException("Delimiter cannot be empty string")
+      case Seq('\\') => throw new IllegalArgumentException("Single backslash is prohibited." +
+        " It has special meaning as beginning of an escape sequence." +
+        " To get the backslash character, pass a string with two backslashes as the delimiter.")
+      case Seq(c) => c
+      case Seq('\\', 't') => '\t'
+      case Seq('\\', 'r') => '\r'
+      case Seq('\\', 'b') => '\b'
+      case Seq('\\', 'f') => '\f'
+      // In case user changes quote char and uses \" as delimiter in options
+      case Seq('\\', '\"') => '\"'
+      case Seq('\\', '\'') => '\''
+      case Seq('\\', '\\') => '\\'
+      case _ if str == """\u0000""" => '\u0000'
+      case Seq('\\', _) =>
+        throw new IllegalArgumentException(s"Unsupported special character for 
delimiter: $str")
+      case _ =>
+        throw new IllegalArgumentException(s"Delimiter cannot be more than one 
character: $str")
     }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/c7eadb5e/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 5d4746c..d59035b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1826,4 +1826,14 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
     val df = spark.read.option("enforceSchema", false).csv(input)
     checkAnswer(df, Row("1", "2"))
   }
+
+  test("using the backward slash as the delimiter") {
+    val input = Seq("""abc\1""").toDS()
+    val delimiter = """\\"""
+    checkAnswer(spark.read.option("delimiter", delimiter).csv(input), 
Row("abc", "1"))
+    checkAnswer(spark.read.option("inferSchema", true).option("delimiter", 
delimiter).csv(input),
+      Row("abc", 1))
+    val schema = new StructType().add("a", StringType).add("b", IntegerType)
+    checkAnswer(spark.read.schema(schema).option("delimiter", delimiter).csv(input), Row("abc", 1))
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/c7eadb5e/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
index 221e44c..60fcbd2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
@@ -28,6 +28,7 @@ class CSVUtilsSuite extends SparkFunSuite {
     assert(CSVUtils.toChar("""\"""") === '\"')
     assert(CSVUtils.toChar("""\'""") === '\'')
     assert(CSVUtils.toChar("""\u0000""") === '\u0000')
+    assert(CSVUtils.toChar("""\\""") === '\\')
   }
 
   test("Does not accept delimiter larger than one character") {
@@ -44,4 +45,17 @@ class CSVUtilsSuite extends SparkFunSuite {
     assert(exception.getMessage.contains("Unsupported special character for 
delimiter"))
   }
 
+  test("string with one backward slash is prohibited") {
+    val exception = intercept[IllegalArgumentException]{
+      CSVUtils.toChar("""\""")
+    }
+    assert(exception.getMessage.contains("Single backslash is prohibited"))
+  }
+
+  test("output proper error message for empty string") {
+    val exception = intercept[IllegalArgumentException]{
+      CSVUtils.toChar("")
+    }
+    assert(exception.getMessage.contains("Delimiter cannot be empty string"))
+  }
 }

