Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/20937#discussion_r178476931
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala
---
@@ -92,32 +93,34 @@ object TextInputJsonDataSource extends JsonDataSource {
sparkSession: SparkSession,
inputPaths: Seq[FileStatus],
parsedOptions: JSONOptions): StructType = {
- val json: Dataset[String] = createBaseDataset(
- sparkSession, inputPaths, parsedOptions.lineSeparator)
+ val json: Dataset[String] = createBaseDataset(sparkSession,
inputPaths, parsedOptions)
+
inferFromDataset(json, parsedOptions)
}
def inferFromDataset(json: Dataset[String], parsedOptions: JSONOptions):
StructType = {
val sampled: Dataset[String] = JsonUtils.sample(json, parsedOptions)
- val rdd: RDD[UTF8String] =
sampled.queryExecution.toRdd.map(_.getUTF8String(0))
- JsonInferSchema.infer(rdd, parsedOptions,
CreateJacksonParser.utf8String)
+ val rdd: RDD[InternalRow] = sampled.queryExecution.toRdd
+
+ JsonInferSchema.infer[InternalRow](
+ rdd,
+ parsedOptions,
+ CreateJacksonParser.internalRow(_, _, 0, parsedOptions.encoding)
+ )
}
private def createBaseDataset(
sparkSession: SparkSession,
inputPaths: Seq[FileStatus],
- lineSeparator: Option[String]): Dataset[String] = {
- val textOptions = lineSeparator.map { lineSep =>
- Map(TextOptions.LINE_SEPARATOR -> lineSep)
- }.getOrElse(Map.empty[String, String])
-
+ parsedOptions: JSONOptions
+ ): Dataset[String] = {
--- End diff --
Let's move this line (the `): Dataset[String] = {` one) up too. I think it should be either
```
...
parsedOptions: JSONOptions)
: Dataset[String] = {
```
or
```
parsedOptions: JSONOptions): Dataset[String] = {
```
https://github.com/databricks/scala-style-guide#spacing-and-indentation
though I believe we usually use the latter style.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]