This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push:
new d63e42d128b [SPARK-38955][SQL] Disable lineSep option in 'from_csv'
and 'schema_of_csv'
d63e42d128b is described below
commit d63e42d128b8814e885b86533f187724fbb7e9fd
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Fri Apr 22 11:43:46 2022 +0900
[SPARK-38955][SQL] Disable lineSep option in 'from_csv' and 'schema_of_csv'
### What changes were proposed in this pull request?
This PR proposes to disable the `lineSep` option in the `from_csv` and
`schema_of_csv` expressions by setting the line separator to `\uFFFF`, a
noncharacter according to the [Unicode
specification](https://www.unicode.org/charts/PDF/UFFF0.pdf). Per that
specification, noncharacters are reserved for internal use within a program
and should not appear in interchanged text.
The Univocity parser does not allow omitting the line separator entirely
(from my code reading), so this approach was proposed instead.
This specific code path is not affected by our `encoding` or `charset`
option because the Univocity parser processes the input as Unicode strings internally.
### Why are the changes needed?
Currently, this option unexpectedly takes effect in these expressions. See
the example of `from_csv` below:
```scala
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
Seq[String]("1,\n2,3,4,5").toDF.select(
col("value"),
from_csv(
col("value"),
StructType(Seq(StructField("a", LongType), StructField("b", StringType)
)), Map[String,String]())).show()
```
```
+-----------+---------------+
| value|from_csv(value)|
+-----------+---------------+
|1,\n2,3,4,5| {1, null}|
+-----------+---------------+
```
`{1, null}` has to be `{1, \n2}`.
The CSV expressions cannot easily support this because `lineSep` is a
plan-wise option that can change the number of returned rows; however, the
expressions are designed to emit one row only, whereas this option is easily
effective in the scan plan with the CSV data source. Therefore, we should disable
this option.
### Does this PR introduce _any_ user-facing change?
Yes, line separator characters can now appear within the output of `from_csv` and
`schema_of_csv` instead of being treated as record delimiters.
### How was this patch tested?
Manually tested, and unit test was added.
Closes #36294 from HyukjinKwon/SPARK-38955.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit f3cc2814d4bc585dad92c9eca9a593d1617d27e9)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../spark/sql/catalyst/expressions/csvExpressions.scala | 16 ++++++++++++++--
.../scala/org/apache/spark/sql/CsvFunctionsSuite.scala | 7 +++++++
2 files changed, 21 insertions(+), 2 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
index 6e08ad346c8..9f38d4a30bf 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
@@ -98,8 +98,14 @@ case class CsvToStructs(
val nameOfCorruptRecord =
SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
@transient lazy val parser = {
+ // 'lineSep' is a plan-wise option so we set a noncharacter, according to
+ // the unicode specification, which should not appear in Java's strings.
+ // See also SPARK-38955 and https://www.unicode.org/charts/PDF/UFFF0.pdf.
+ // scalastyle:off nonascii
+ val exprOptions = options ++ Map("lineSep" -> '\uFFFF'.toString)
+ // scalastyle:on nonascii
val parsedOptions = new CSVOptions(
- options,
+ exprOptions,
columnPruning = true,
defaultTimeZoneId = timeZoneId.get,
defaultColumnNameOfCorruptRecord = nameOfCorruptRecord)
@@ -186,7 +192,13 @@ case class SchemaOfCsv(
}
override def eval(v: InternalRow): Any = {
- val parsedOptions = new CSVOptions(options, true, "UTC")
+ // 'lineSep' is a plan-wise option so we set a noncharacter, according to
+ // the unicode specification, which should not appear in Java's strings.
+ // See also SPARK-38955 and https://www.unicode.org/charts/PDF/UFFF0.pdf.
+ // scalastyle:off nonascii
+ val exprOptions = options ++ Map("lineSep" -> '\uFFFF'.toString)
+ // scalastyle:on nonascii
+ val parsedOptions = new CSVOptions(exprOptions, true, "UTC")
val parser = new CsvParser(parsedOptions.asParserSettings)
val row = parser.parseLine(csv.toString)
assert(row != null, "Parsed CSV record should not be null.")
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
index b676c26023a..b683f3573b3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
@@ -364,4 +364,11 @@ class CsvFunctionsSuite extends QueryTest with
SharedSparkSession {
.selectExpr("value.a")
checkAnswer(fromCsvDF, Row(null))
}
+
+ test("SPARK-38955: disable lineSep option in from_csv and schema_of_csv") {
+ val df = Seq[String]("1,2\n2").toDF("csv")
+ val actual = df.select(from_csv(
+ $"csv", schema_of_csv("1,2\n2"), Map.empty[String, String].asJava))
+ checkAnswer(actual, Row(Row(1, "2\n2")))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]