HyukjinKwon commented on pull request #29172:
URL: https://github.com/apache/spark/pull/29172#issuecomment-661604611
Okay, I realised why it's tricky. My concern about this patch is that `originalMap` will lose the duplicated instances when it adds them now.
I came up with a slightly different approach:
```diff
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index a7b3d08ec4d..0d8691e3f05 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -238,7 +238,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
         Some("paths" -> objectMapper.writeValueAsString(paths.toArray))
       }
 
-      val finalOptions = sessionOptions ++ extraOptions.toMap ++ pathsOption
+      val finalOptions = collection.immutable.ListMap.empty[String, String] ++
+        sessionOptions ++ extraOptions ++ pathsOption
       val dsOptions = new CaseInsensitiveStringMap(finalOptions.asJava)
       val (table, catalog, ident) = provider match {
         case _: SupportsCatalogOptions if userSpecifiedSchema.nonEmpty =>
@@ -276,7 +277,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
         paths = paths,
         userSpecifiedSchema = userSpecifiedSchema,
         className = source,
-        options = extraOptions.toMap).resolveRelation())
+        options = collection.immutable.ListMap.empty[String, String] ++ extraOptions
+      ).resolveRelation())
   }
 
   /**
@@ -361,7 +363,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
       connectionProperties: Properties): DataFrame = {
     assertNoSpecifiedSchema("jdbc")
     // connectionProperties should override settings in extraOptions.
-    val params = extraOptions.toMap ++ connectionProperties.asScala.toMap
+    val params = collection.immutable.ListMap.empty[String, String] ++
+      extraOptions ++ connectionProperties.asScala.toMap
     val options = new JDBCOptions(url, table, params)
     val parts: Array[Partition] = predicates.zipWithIndex.map { case (part, i) =>
       JDBCPartition(part, i) : Partition
@@ -499,7 +502,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    */
   def json(jsonDataset: Dataset[String]): DataFrame = {
     val parsedOptions = new JSONOptions(
-      extraOptions.toMap,
+      collection.immutable.ListMap.empty[String, String] ++ extraOptions,
       sparkSession.sessionState.conf.sessionLocalTimeZone,
       sparkSession.sessionState.conf.columnNameOfCorruptRecord)
 
@@ -553,7 +556,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    */
   def csv(csvDataset: Dataset[String]): DataFrame = {
     val parsedOptions: CSVOptions = new CSVOptions(
-      extraOptions.toMap,
+      collection.immutable.ListMap.empty[String, String] ++ extraOptions,
       sparkSession.sessionState.conf.csvColumnPruning,
       sparkSession.sessionState.conf.sessionLocalTimeZone)
     val filteredLines: Dataset[String] =
@@ -879,6 +882,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
 
   private var userSpecifiedSchema: Option[StructType] = None
 
-  private val extraOptions = new scala.collection.mutable.HashMap[String, String]
+  private val extraOptions = new scala.collection.mutable.LinkedHashMap[String, String]
 
 }
```
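For context, here is a minimal standalone sketch (plain Scala, not part of the patch; `OptionOrdering` and the sample entries are made up) of the ordering behaviour this relies on, i.e. why the default maps can drop insertion order while `ListMap`/`LinkedHashMap` keep it:
```scala
// Standalone illustration, not Spark code: the default Map/HashMap does not
// guarantee iteration in insertion order, while ListMap/LinkedHashMap do.
import scala.collection.immutable.ListMap
import scala.collection.mutable.LinkedHashMap

object OptionOrdering extends App {
  val entries = Seq("zz" -> "1", "aa" -> "2", "mm" -> "3", "bb" -> "4", "cc" -> "5")

  // Plain immutable Map: once it grows beyond a few entries it is hash-based,
  // so the iteration order of the options is unspecified.
  println((Map.empty[String, String] ++ entries).keys.mkString(", "))

  // ListMap iterates in insertion order; merging with ++ still lets a later
  // binding for the same key win, as in the finalOptions computation above.
  val ordered = ListMap.empty[String, String] ++ entries
  println(ordered.keys.mkString(", "))                    // zz, aa, mm, bb, cc
  println((ordered ++ Seq("aa" -> "overridden"))("aa"))   // overridden

  // LinkedHashMap is the mutable, insertion-ordered counterpart proposed for
  // extraOptions.
  val extra = LinkedHashMap.empty[String, String]
  extra ++= entries
  println(extra.keys.mkString(", "))                      // zz, aa, mm, bb, cc
}
```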
I checked the related code in `DataFrameReader`, and this path seems to work at the moment.
But there is another problem with my patch: it introduces the `ListMap.empty ++ map` pattern, whereas the signatures within Spark take a plain `Map`. This could cause many corner cases and is error-prone in the future.
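To make that concrete, here is a hypothetical sketch (again plain Scala with made-up names and entries, not Spark code) of how the ordering can silently disappear once the value travels through a `Map`-typed signature:
```scala
// Illustration only: the static type Map[String, String] says nothing about
// ordering, so a downstream merge can discard it without any compile error.
import scala.collection.immutable.ListMap

object ListMapPitfall extends App {
  // Looks ordered here...
  val options: Map[String, String] =
    ListMap.empty[String, String] ++
      Seq("zz" -> "1", "aa" -> "2", "mm" -> "3", "bb" -> "4", "cc" -> "5")
  println(options.keys.mkString(", "))  // zz, aa, mm, bb, cc

  // ...but a caller that merges it into a default Map (i.e. with the ListMap
  // on the right-hand side of ++) gets a hash-based result, and the insertion
  // order is no longer guaranteed.
  val defaults = Map("compression" -> "snappy", "mode" -> "PERMISSIVE")
  val merged: Map[String, String] = defaults ++ options
  println(merged.keys.mkString(", "))
}
```
Nothing in the `Map` signature forces callers to keep the ordered map on the left of `++`, which is why I think the pattern is fragile.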
cc @cloud-fan @maropu @viirya. Do you guys have some thoughts on this?