Repository: spark
Updated Branches:
refs/heads/branch-2.4 bb211cf27 -> 1a335444e
[SPARK-25660][SQL] Fix for the backward slash as CSV fields delimiter
## What changes were proposed in this pull request?
The PR fixes the exception raised when accessing chars beyond the end of the
delimiter string. In particular, using the backward slash `\` as the CSV field
delimiter causes the following exception when reading `abc\1`:
```Scala
String index out of range: 1
java.lang.StringIndexOutOfBoundsException: String index out of range: 1
at java.lang.String.charAt(String.java:658)
```
because `str.charAt(1)` tries to access a char beyond the end of `str` in `CSVUtils.toChar`.
## How was this patch tested?
Added tests for the empty string and for a string containing the backward slash
to `CSVUtilsSuite`. Besides that, I added an end-to-end test to check how the
backward slash is handled when reading a CSV string that contains it.
Closes #22654 from MaxGekk/csv-slash-delim.
Authored-by: Maxim Gekk <[email protected]>
Signed-off-by: gatorsmile <[email protected]>
(cherry picked from commit c7eadb5e6652468f9d5cd714c112ba1de187eea8)
Signed-off-by: gatorsmile <[email protected]>
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a335444
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a335444
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a335444
Branch: refs/heads/branch-2.4
Commit: 1a335444e6ba4124bd0f7f351f097c0bdb46ae85
Parents: bb211cf
Author: Maxim Gekk <[email protected]>
Authored: Fri Oct 12 12:04:00 2018 -0700
Committer: gatorsmile <[email protected]>
Committed: Fri Oct 12 12:04:16 2018 -0700
----------------------------------------------------------------------
.../execution/datasources/csv/CSVUtils.scala | 36 +++++++++++---------
.../execution/datasources/csv/CSVSuite.scala | 10 ++++++
.../datasources/csv/CSVUtilsSuite.scala | 14 ++++++++
3 files changed, 43 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/1a335444/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
index 7ce65fa..b367b3d 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
@@ -97,23 +97,25 @@ object CSVUtils {
*/
@throws[IllegalArgumentException]
def toChar(str: String): Char = {
- if (str.charAt(0) == '\\') {
- str.charAt(1)
- match {
- case 't' => '\t'
- case 'r' => '\r'
- case 'b' => '\b'
- case 'f' => '\f'
- case '\"' => '\"' // In case user changes quote char and uses \" as
delimiter in options
- case '\'' => '\''
- case 'u' if str == """\u0000""" => '\u0000'
- case _ =>
- throw new IllegalArgumentException(s"Unsupported special character
for delimiter: $str")
- }
- } else if (str.length == 1) {
- str.charAt(0)
- } else {
- throw new IllegalArgumentException(s"Delimiter cannot be more than one
character: $str")
+ (str: Seq[Char]) match {
+ case Seq() => throw new IllegalArgumentException("Delimiter cannot be
empty string")
+ case Seq('\\') => throw new IllegalArgumentException("Single backslash
is prohibited." +
+ " It has special meaning as beginning of an escape sequence." +
+ " To get the backslash character, pass a string with two backslashes
as the delimiter.")
+ case Seq(c) => c
+ case Seq('\\', 't') => '\t'
+ case Seq('\\', 'r') => '\r'
+ case Seq('\\', 'b') => '\b'
+ case Seq('\\', 'f') => '\f'
+ // In case user changes quote char and uses \" as delimiter in options
+ case Seq('\\', '\"') => '\"'
+ case Seq('\\', '\'') => '\''
+ case Seq('\\', '\\') => '\\'
+ case _ if str == """\u0000""" => '\u0000'
+ case Seq('\\', _) =>
+ throw new IllegalArgumentException(s"Unsupported special character for
delimiter: $str")
+ case _ =>
+ throw new IllegalArgumentException(s"Delimiter cannot be more than one
character: $str")
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/1a335444/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 5d4746c..d59035b 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1826,4 +1826,14 @@ class CSVSuite extends QueryTest with SharedSQLContext
with SQLTestUtils with Te
val df = spark.read.option("enforceSchema", false).csv(input)
checkAnswer(df, Row("1", "2"))
}
+
+ test("using the backward slash as the delimiter") {
+ val input = Seq("""abc\1""").toDS()
+ val delimiter = """\\"""
+ checkAnswer(spark.read.option("delimiter", delimiter).csv(input),
Row("abc", "1"))
+ checkAnswer(spark.read.option("inferSchema", true).option("delimiter",
delimiter).csv(input),
+ Row("abc", 1))
+ val schema = new StructType().add("a", StringType).add("b", IntegerType)
+ checkAnswer(spark.read.schema(schema).option("delimiter",
delimiter).csv(input), Row("abc", 1))
+ }
}
http://git-wip-us.apache.org/repos/asf/spark/blob/1a335444/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
index 221e44c..60fcbd2 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
@@ -28,6 +28,7 @@ class CSVUtilsSuite extends SparkFunSuite {
assert(CSVUtils.toChar("""\"""") === '\"')
assert(CSVUtils.toChar("""\'""") === '\'')
assert(CSVUtils.toChar("""\u0000""") === '\u0000')
+ assert(CSVUtils.toChar("""\\""") === '\\')
}
test("Does not accept delimiter larger than one character") {
@@ -44,4 +45,17 @@ class CSVUtilsSuite extends SparkFunSuite {
assert(exception.getMessage.contains("Unsupported special character for
delimiter"))
}
+ test("string with one backward slash is prohibited") {
+ val exception = intercept[IllegalArgumentException]{
+ CSVUtils.toChar("""\""")
+ }
+ assert(exception.getMessage.contains("Single backslash is prohibited"))
+ }
+
+ test("output proper error message for empty string") {
+ val exception = intercept[IllegalArgumentException]{
+ CSVUtils.toChar("")
+ }
+ assert(exception.getMessage.contains("Delimiter cannot be empty string"))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]