This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 0922380 [SPARK-34768][SQL] Respect the default input buffer size in
Univocity
0922380 is described below
commit 0922380406f667ad11b0795ca63be2a8a21a7266
Author: HyukjinKwon <[email protected]>
AuthorDate: Wed Mar 17 19:55:49 2021 +0900
[SPARK-34768][SQL] Respect the default input buffer size in Univocity
### What changes were proposed in this pull request?
This PR proposes to follow Univocity's default input buffer size.
### Why are the changes needed?
- Firstly, it's best to trust their judgement on the default values. Also
128 is too low.
- Default values arguably have more test coverage in Univocity.
- It will also fix https://github.com/uniVocity/univocity-parsers/issues/449
- ^ is a regression compared to Spark 2.4
### Does this PR introduce _any_ user-facing change?
No. In addition, it fixes a regression.
### How was this patch tested?
Manually tested, and added a unit test.
Closes #31858 from HyukjinKwon/SPARK-34768.
Authored-by: HyukjinKwon <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
(cherry picked from commit 385f1e8f5de5dcad62554cd75446e98c9380b384)
Signed-off-by: HyukjinKwon <[email protected]>
---
.../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 3 ---
.../apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 11 +++++++++++
2 files changed, 11 insertions(+), 3 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index ec40599..c6a8061 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -166,8 +166,6 @@ class CSVOptions(
val quoteAll = getBool("quoteAll", false)
- val inputBufferSize = 128
-
/**
* The max error content length in CSV parser/writer exception message.
*/
@@ -259,7 +257,6 @@ class CSVOptions(
settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead)
settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead)
settings.setReadInputOnSeparateThread(false)
- settings.setInputBufferSize(inputBufferSize)
settings.setMaxColumns(maxColumns)
settings.setNullValue(nullValue)
settings.setEmptyValue(emptyValueInRead)
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 30f0e45..3fe6ce7 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -2452,6 +2452,17 @@ abstract class CSVSuite
assert(result.sameElements(exceptResults))
}
}
+
+ test("SPARK-34768: counting a long record with ignoreTrailingWhiteSpace set
to true") {
+ val bufSize = 128
+ val line = "X" * (bufSize - 1) + "| |"
+ withTempPath { path =>
+ Seq(line).toDF.write.text(path.getAbsolutePath)
+ assert(spark.read.format("csv")
+ .option("delimiter", "|")
+ .option("ignoreTrailingWhiteSpace",
"true").load(path.getAbsolutePath).count() == 1)
+ }
+ }
}
class CSVv1Suite extends CSVSuite {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]