Repository: spark
Updated Branches:
refs/heads/master d0ecff285 -> 1e6c1d8bf
[SPARK-25493][SQL] Use auto-detection for CRLF in CSV datasource multiline mode
## What changes were proposed in this pull request?
CSVs with Windows-style CRLF ('\r\n') line endings don't work in multiline mode.
They work fine in single-line mode because line splitting is done by Hadoop,
which can handle all the different types of line separators. This PR fixes the
problem by enabling Univocity's line separator detection in multiline mode,
which detects '\r\n', '\r', or '\n' automatically, just as Hadoop does in
single-line mode.
## How was this patch tested?
Added a unit test that reads a file with CRLF line endings in multiline mode.
Closes #22503 from justinuang/fix-clrf-multiline.
Authored-by: Justin Uang <[email protected]>
Signed-off-by: hyukjinkwon <[email protected]>
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1e6c1d8b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1e6c1d8b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1e6c1d8b
Branch: refs/heads/master
Commit: 1e6c1d8bfb7841596452e25b870823b9a4b267f4
Parents: d0ecff2
Author: Justin Uang <[email protected]>
Authored: Fri Oct 19 11:13:02 2018 +0800
Committer: hyukjinkwon <[email protected]>
Committed: Fri Oct 19 11:13:02 2018 +0800
----------------------------------------------------------------------
.../org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 2 ++
sql/core/src/test/resources/test-data/cars-crlf.csv | 7 +++++++
.../spark/sql/execution/datasources/csv/CSVSuite.scala | 12 ++++++++++++
3 files changed, 21 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/1e6c1d8b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
----------------------------------------------------------------------
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index 3e25d82..cdaaa17 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -212,6 +212,8 @@ class CSVOptions(
settings.setEmptyValue(emptyValueInRead)
settings.setMaxCharsPerColumn(maxCharsPerColumn)
settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER)
+ settings.setLineSeparatorDetectionEnabled(multiLine == true)
+
settings
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/1e6c1d8b/sql/core/src/test/resources/test-data/cars-crlf.csv
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/test-data/cars-crlf.csv
b/sql/core/src/test/resources/test-data/cars-crlf.csv
new file mode 100644
index 0000000..d018d08
--- /dev/null
+++ b/sql/core/src/test/resources/test-data/cars-crlf.csv
@@ -0,0 +1,7 @@
+
+year,make,model,comment,blank
+"2012","Tesla","S","No comment",
+
+1997,Ford,E350,"Go get one now they are going fast",
+2015,Chevy,Volt
+
http://git-wip-us.apache.org/repos/asf/spark/blob/1e6c1d8b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index d59035b..d43efc8 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -52,6 +52,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with
SQLTestUtils with Te
private val carsNullFile = "test-data/cars-null.csv"
private val carsEmptyValueFile = "test-data/cars-empty-value.csv"
private val carsBlankColName = "test-data/cars-blank-column-name.csv"
+ private val carsCrlf = "test-data/cars-crlf.csv"
private val emptyFile = "test-data/empty.csv"
private val commentsFile = "test-data/comments.csv"
private val disableCommentsFile = "test-data/disable_comments.csv"
@@ -220,6 +221,17 @@ class CSVSuite extends QueryTest with SharedSQLContext
with SQLTestUtils with Te
}
}
+ test("crlf line separators in multiline mode") {
+ val cars = spark
+ .read
+ .format("csv")
+ .option("multiLine", "true")
+ .option("header", "true")
+ .load(testFile(carsCrlf))
+
+ verifyCars(cars, withHeader = true)
+ }
+
test("test aliases sep and encoding for delimiter and charset") {
// scalastyle:off
val cars = spark
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]