Github user yhuai commented on a diff in the pull request:
https://github.com/apache/spark/pull/14102#discussion_r71096584
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala
---
@@ -35,184 +34,306 @@ import org.apache.spark.util.Utils
private[json] class SparkSQLJsonProcessingException(msg: String) extends
RuntimeException(msg)
-object JacksonParser extends Logging {
+private[sql] class JacksonParser(schema: StructType, options: JSONOptions)
extends Logging {
+ import com.fasterxml.jackson.core.JsonToken._
- def parse(
- input: RDD[String],
- schema: StructType,
- columnNameOfCorruptRecords: String,
- configOptions: JSONOptions): RDD[InternalRow] = {
+ // A `ValueConverter` is responsible for converting a value from
`JsonParser`
+ // to a value in a field for `InternalRow`.
+ private type ValueConverter = (JsonParser) => Any
- input.mapPartitions { iter =>
- parseJson(iter, schema, columnNameOfCorruptRecords, configOptions)
+ // `ValueConverter`s for the root schema for all fields in the schema
+ private val rootConverter: ValueConverter = makeRootConverter(schema)
+
+ private val factory = new JsonFactory()
+ options.setJacksonOptions(factory)
+
+ private def failedConversion(
+ parser: JsonParser,
+ dataType: DataType): Any = parser.getCurrentToken match {
+ case null | VALUE_NULL =>
+ null
+
+ case _ if parser.getTextLength < 1 =>
+ // guard the non string type
+ null
+
+ case token =>
+ // We cannot parse this token based on the given data type. So, we
throw a
+ // SparkSQLJsonProcessingException and this exception will be caught by
+ // parseJson method.
+ throw new SparkSQLJsonProcessingException(
+ s"Failed to parse a value for data type $dataType (current token:
$token).")
+ }
+
+ private def failedRecord(record: String): Seq[InternalRow] = {
+ // create a row even if no corrupt record column is present
+ if (options.failFast) {
+ throw new RuntimeException(s"Malformed line in FAILFAST mode:
$record")
+ }
+ if (options.dropMalformed) {
+ logWarning(s"Dropping malformed line: $record")
+ Nil
+ } else {
+ val row = new GenericMutableRow(schema.length)
+ for (corruptIndex <-
schema.getFieldIndex(options.columnNameOfCorruptRecord)) {
+ require(schema(corruptIndex).dataType == StringType)
+ row.update(corruptIndex, UTF8String.fromString(record))
+ }
+ Seq(row)
}
}
/**
- * Parse the current token (and related children) according to a desired
schema
- * This is a wrapper for the method `convertField()` to handle a row
wrapped
- * with an array.
+ * Create a converter which converts the JSON documents held by the
`JsonParser`
+ * to a value according to a desired schema. This is a wrapper for the
method
+ * `makeConverter()` to handle a row wrapped with an array.
*/
- def convertRootField(
- factory: JsonFactory,
- parser: JsonParser,
- schema: DataType): Any = {
- import com.fasterxml.jackson.core.JsonToken._
- (parser.getCurrentToken, schema) match {
- case (START_ARRAY, st: StructType) =>
- // SPARK-3308: support reading top level JSON arrays and take
every element
- // in such an array as a row
- convertArray(factory, parser, st)
-
- case (START_OBJECT, ArrayType(st, _)) =>
- // the business end of SPARK-3308:
- // when an object is found but an array is requested just wrap it
in a list
- convertField(factory, parser, st) :: Nil
+ def makeRootConverter(dataType: DataType): ValueConverter = dataType
match {
+ case st: StructType =>
+ // SPARK-3308: support reading top level JSON arrays and take every
element
+ // in such an array as a row
--- End diff --
It seems you meant to put this code comment at the following location instead:
https://github.com/apache/spark/pull/14102/files#diff-8affe5ec7d691943a88e43eb30af656eR99
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]