This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e3e33d8 [SPARK-26372][SQL] Don't reuse value from previous row when
parsing bad CSV input field
e3e33d8 is described below
commit e3e33d8794da5f3597b8d706b734af5025360939
Author: Bruce Robbins <[email protected]>
AuthorDate: Sun Dec 16 11:02:00 2018 +0800
[SPARK-26372][SQL] Don't reuse value from previous row when parsing bad CSV
input field
## What changes were proposed in this pull request?
When a CSV input field cannot be converted to its column type, the parser
accidentally leaves the value from the previous successfully parsed row in
that column. See the example in SPARK-26372.
This PR ensures that the associated column is set to null when an input
field cannot be converted.
## How was this patch tested?
Added new test.
Ran all SQL unit tests (testOnly org.apache.spark.sql.*).
Ran pyspark tests for pyspark-sql
Closes #23323 from bersprockets/csv-bad-field.
Authored-by: Bruce Robbins <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../spark/sql/catalyst/csv/UnivocityParser.scala | 1 +
.../src/test/resources/test-data/bad_after_good.csv | 2 ++
.../sql/execution/datasources/csv/CSVSuite.scala | 19 +++++++++++++++++++
3 files changed, 22 insertions(+)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
index ed08912..82a5b3c 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
@@ -239,6 +239,7 @@ class UnivocityParser(
} catch {
case NonFatal(e) =>
badRecordException = badRecordException.orElse(Some(e))
+ row.setNullAt(i)
}
i += 1
}
diff --git a/sql/core/src/test/resources/test-data/bad_after_good.csv
b/sql/core/src/test/resources/test-data/bad_after_good.csv
new file mode 100644
index 0000000..4621a7d
--- /dev/null
+++ b/sql/core/src/test/resources/test-data/bad_after_good.csv
@@ -0,0 +1,2 @@
+"good record",1999-08-01
+"bad record",1999-088-01
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 3b977d7..d9e5d7a 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -63,6 +63,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with
SQLTestUtils with Te
private val datesFile = "test-data/dates.csv"
private val unescapedQuotesFile = "test-data/unescaped-quotes.csv"
private val valueMalformedFile = "test-data/value-malformed.csv"
+ private val badAfterGoodFile = "test-data/bad_after_good.csv"
/** Verifies data and schema. */
private def verifyCars(
@@ -2012,4 +2013,22 @@ class CSVSuite extends QueryTest with SharedSQLContext
with SQLTestUtils with Te
assert(!files.exists(_.getName.endsWith("csv")))
}
}
+
+ test("Do not reuse last good value for bad input field") {
+ val schema = StructType(
+ StructField("col1", StringType) ::
+ StructField("col2", DateType) ::
+ Nil
+ )
+ val rows = spark.read
+ .schema(schema)
+ .format("csv")
+ .load(testFile(badAfterGoodFile))
+
+ val expectedRows = Seq(
+ Row("good record", java.sql.Date.valueOf("1999-08-01")),
+ Row("bad record", null))
+
+ checkAnswer(rows, expectedRows)
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]