Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/20937#discussion_r180098845
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala ---
@@ -92,26 +93,30 @@ object TextInputJsonDataSource extends JsonDataSource {
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
       parsedOptions: JSONOptions): StructType = {
-    val json: Dataset[String] = createBaseDataset(
-      sparkSession, inputPaths, parsedOptions.lineSeparator)
+    val json: Dataset[String] = createBaseDataset(sparkSession, inputPaths, parsedOptions)
+
     inferFromDataset(json, parsedOptions)
   }

   def inferFromDataset(json: Dataset[String], parsedOptions: JSONOptions): StructType = {
     val sampled: Dataset[String] = JsonUtils.sample(json, parsedOptions)
-    val rdd: RDD[UTF8String] = sampled.queryExecution.toRdd.map(_.getUTF8String(0))
-    JsonInferSchema.infer(rdd, parsedOptions, CreateJacksonParser.utf8String)
+    val rdd: RDD[InternalRow] = sampled.queryExecution.toRdd
+    val rowParser = parsedOptions.encoding.map { enc =>
+      CreateJacksonParser.internalRow(enc, _: JsonFactory, _: InternalRow, 0)
--- End diff ---
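
For context, the hunk above picks a row parser based on whether an encoding was configured, partially applying the encoding so both branches yield a function of the same shape. A minimal standalone sketch of that pattern, with hypothetical helpers parseWithEncoding and parseDetectingEncoding standing in for CreateJacksonParser's methods (not Spark's actual API):

    import java.io.{ByteArrayInputStream, InputStreamReader}

    import com.fasterxml.jackson.core.{JsonFactory, JsonParser}

    object ParserChoiceSketch {
      type Record = Array[Byte]

      // Hypothetical stand-in for the encoding-aware variant: decode the
      // record's bytes with the configured charset before Jackson parses them.
      def parseWithEncoding(enc: String, factory: JsonFactory, rec: Record): JsonParser =
        factory.createParser(new InputStreamReader(new ByteArrayInputStream(rec), enc))

      // Hypothetical stand-in for the fallback; Jackson auto-detects UTF
      // encodings when handed raw bytes.
      def parseDetectingEncoding(factory: JsonFactory, rec: Record): JsonParser =
        factory.createParser(rec)

      // Partially applying the encoding gives both branches the same
      // (JsonFactory, Record) => JsonParser shape, mirroring the hunk above.
      def chooseParser(encoding: Option[String]): (JsonFactory, Record) => JsonParser =
        encoding
          .map(enc => parseWithEncoding(enc, _: JsonFactory, _: Record))
          .getOrElse(parseDetectingEncoding _)
    }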
Sorry, I think the array is already wrapped by a ByteArrayInputStream per record?
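
A minimal sketch of what "wrapped by a ByteArrayInputStream per record" means here, assuming each record arrives as a plain byte array; the helper name is hypothetical, not Spark's API:

    import java.io.{ByteArrayInputStream, InputStreamReader}

    import com.fasterxml.jackson.core.{JsonFactory, JsonParser}

    // Hypothetical helper: wrap one record's bytes in a fresh stream so an
    // InputStreamReader can decode them with the configured encoding before
    // Jackson parses the JSON. One stream is allocated per record.
    def parseRecord(factory: JsonFactory, bytes: Array[Byte], enc: String): JsonParser =
      factory.createParser(new InputStreamReader(new ByteArrayInputStream(bytes), enc))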
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]