shujingyang-db commented on code in PR #44318:
URL: https://github.com/apache/spark/pull/44318#discussion_r1434536452
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -201,27 +217,30 @@ class StaxXmlParser(
case (_: EndElement, _: DataType) => null
case (c: Characters, ArrayType(st, _)) =>
// For `ArrayType`, it needs to return the type of element. The values
are merged later.
+ parser.next
convertTo(c.getData, st)
case (c: Characters, st: StructType) =>
- // If a value tag is present, this can be an attribute-only element
whose values is in that
- // value tag field. Or, it can be a mixed-type element with both some
character elements
- // and other complex structure. Character elements are ignored.
- val attributesOnly = st.fields.forall { f =>
- f.name == options.valueTag ||
f.name.startsWith(options.attributePrefix)
- }
- if (attributesOnly) {
- // If everything else is an attribute column, there's no complex
structure.
- // Just return the value of the character element, or null if we
don't have a value tag
- st.find(_.name == options.valueTag).map(
- valueTag => convertTo(c.getData, valueTag.dataType)).orNull
- } else {
- // Otherwise, ignore this character element, and continue parsing
the following complex
- // structure
- parser.next
- parser.peek match {
- case _: EndElement => null // no struct here at all; done
- case _ => convertObject(parser, st)
- }
+ parser.next
+ parser.peek match {
+ case _: EndElement =>
+ // It couldn't be an array of value tags
+ // as the opening tag is immediately followed by a closing tag.
+ if (c.isWhiteSpace) {
+ return null
+ }
+ val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+ indexOpt match {
+ case Some(index) =>
+ convertTo(c.getData, st.fields(index).dataType)
+ case None => null
+ }
+ case _ =>
+ val row = convertObject(parser, st)
+ if (!c.isWhiteSpace) {
+ addOrUpdate(row.toSeq(st).toArray, st, options.valueTag,
c.getData, addToTail = false)
Review Comment:
This is because, in this case, we encounter the interspersed value first and
then the nested objects. We want to make sure that the value tag appears before
the nested objects, hence `addToTail = false`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]