Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/22152#discussion_r211607005
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
---
@@ -69,10 +70,17 @@ private[sql] object JsonInferSchema {
}.reduceOption(typeMerger).toIterator
}
- // Here we get RDD local iterator then fold, instead of calling `RDD.fold` directly, because
- // `RDD.fold` will run the fold function in DAGScheduler event loop thread, which may not have
- // active SparkSession and `SQLConf.get` may point to the wrong configs.
- val rootType = mergedTypesFromPartitions.toLocalIterator.fold(StructType(Nil))(typeMerger)
+ // Here we manually submit a fold-like Spark job, so that we can set the SQLConf when running
+ // the fold functions in the scheduler event loop thread.
+ val existingConf = SQLConf.get
+ var rootType: DataType = StructType(Nil)
+ val foldPartition = (iter: Iterator[DataType]) => iter.fold(StructType(Nil))(typeMerger)
+ val mergeResult = (index: Int, taskResult: DataType) => {
+ rootType = SQLConf.withExistingConf(existingConf) {
--- End diff --
Ah, good point!
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]