sandip-db commented on code in PR #44571:
URL: https://github.com/apache/spark/pull/44571#discussion_r1442428458
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -208,56 +202,36 @@ class StaxXmlParser(
(parser.peek, dataType) match {
case (_: StartElement, dt: DataType) => convertComplicatedType(dt,
attributes)
case (_: EndElement, _: StringType) =>
+ StaxXmlParserUtils.consumeNextEndElement(parser)
Review Comment:
```suggestion
parser.next
```
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -208,56 +202,36 @@ class StaxXmlParser(
(parser.peek, dataType) match {
case (_: StartElement, dt: DataType) => convertComplicatedType(dt,
attributes)
case (_: EndElement, _: StringType) =>
+ StaxXmlParserUtils.consumeNextEndElement(parser)
// Empty. It's null if "" is the null value
if (options.nullValue == "") {
null
} else {
UTF8String.fromString("")
}
- case (_: EndElement, _: DataType) => null
+ case (_: EndElement, _: DataType) =>
+ StaxXmlParserUtils.consumeNextEndElement(parser)
+ null
case (c: Characters, ArrayType(st, _)) =>
// For `ArrayType`, it needs to return the type of element. The values
are merged later.
parser.next
- convertTo(c.getData, st)
- case (c: Characters, st: StructType) =>
- parser.next
- parser.peek match {
- case _: EndElement =>
- // It couldn't be an array of value tags
- // as the opening tag is immediately followed by a closing tag.
- if (c.isWhiteSpace) {
- return null
- }
- val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
- indexOpt match {
- case Some(index) =>
- convertTo(c.getData, st.fields(index).dataType)
- case None => null
- }
- case _ =>
- val row = convertObject(parser, st)
- if (!c.isWhiteSpace) {
- addOrUpdate(row.toSeq(st).toArray, st, options.valueTag,
c.getData, addToTail = false)
- } else {
- row
- }
- }
+ val value = convertTo(c.getData, st)
+ StaxXmlParserUtils.consumeNextEndElement(parser)
Review Comment:
The check in this function will fail if the next event is Comment, Cdata,
etc.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -423,19 +396,22 @@ class StaxXmlParser(
}
} else {
StaxXmlParserUtils.skipChildren(parser)
+ StaxXmlParserUtils.consumeNextEndElement(parser)
Review Comment:
Why not skip EndElement in skipChildren itself?
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala:
##########
@@ -151,6 +151,7 @@ object StaxXmlParserUtils {
indent > 0
case _ => true
})
+ consumeNextEndElement(parser)
Review Comment:
```suggestion
if (indent <= 0) parser.next
indent > 0
case _ => true
})
```
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -208,56 +202,36 @@ class StaxXmlParser(
(parser.peek, dataType) match {
case (_: StartElement, dt: DataType) => convertComplicatedType(dt,
attributes)
case (_: EndElement, _: StringType) =>
+ StaxXmlParserUtils.consumeNextEndElement(parser)
// Empty. It's null if "" is the null value
if (options.nullValue == "") {
null
} else {
UTF8String.fromString("")
}
- case (_: EndElement, _: DataType) => null
+ case (_: EndElement, _: DataType) =>
+ StaxXmlParserUtils.consumeNextEndElement(parser)
Review Comment:
```suggestion
parser.next
```
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -208,56 +202,36 @@ class StaxXmlParser(
(parser.peek, dataType) match {
case (_: StartElement, dt: DataType) => convertComplicatedType(dt,
attributes)
case (_: EndElement, _: StringType) =>
+ StaxXmlParserUtils.consumeNextEndElement(parser)
// Empty. It's null if "" is the null value
if (options.nullValue == "") {
null
} else {
UTF8String.fromString("")
}
- case (_: EndElement, _: DataType) => null
+ case (_: EndElement, _: DataType) =>
+ StaxXmlParserUtils.consumeNextEndElement(parser)
+ null
case (c: Characters, ArrayType(st, _)) =>
// For `ArrayType`, it needs to return the type of element. The values
are merged later.
parser.next
- convertTo(c.getData, st)
- case (c: Characters, st: StructType) =>
- parser.next
- parser.peek match {
- case _: EndElement =>
- // It couldn't be an array of value tags
- // as the opening tag is immediately followed by a closing tag.
- if (c.isWhiteSpace) {
- return null
- }
- val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
- indexOpt match {
- case Some(index) =>
- convertTo(c.getData, st.fields(index).dataType)
- case None => null
- }
- case _ =>
- val row = convertObject(parser, st)
- if (!c.isWhiteSpace) {
- addOrUpdate(row.toSeq(st).toArray, st, options.valueTag,
c.getData, addToTail = false)
- } else {
- row
- }
- }
+ val value = convertTo(c.getData, st)
+ StaxXmlParserUtils.consumeNextEndElement(parser)
+ value
+ case (_: Characters, st: StructType) =>
+ convertObject(parser, st)
case (_: Characters, _: StringType) =>
convertTo(StaxXmlParserUtils.currentStructureAsString(parser),
StringType)
case (c: Characters, _: DataType) if c.isWhiteSpace =>
// When `Characters` is found, we need to look further to decide
// if this is really data or space between other elements.
- val data = c.getData
parser.next
- parser.peek match {
- case _: StartElement => convertComplicatedType(dataType, attributes)
- case _: EndElement if data.isEmpty => null
- case _: EndElement => convertTo(data, dataType)
- case _ => convertField(parser, dataType, attributes)
- }
+ convertField(parser, dataType, attributes)
case (c: Characters, dt: DataType) =>
+ val value = convertTo(c.getData, dt)
parser.next
- convertTo(c.getData, dt)
+ StaxXmlParserUtils.consumeNextEndElement(parser)
Review Comment:
The check in this function will fail if the next event is Comment, Cdata,
etc.
`<ROW><foo attr="a">1<!--This is a comment--></foo></ROW>`
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -423,19 +396,22 @@ class StaxXmlParser(
}
} else {
StaxXmlParserUtils.skipChildren(parser)
+ StaxXmlParserUtils.consumeNextEndElement(parser)
}
}
} catch {
case e: SparkUpgradeException => throw e
case NonFatal(e) =>
badRecordException = badRecordException.orElse(Some(e))
+ StaxXmlParserUtils.skipChildren(parser)
+ StaxXmlParserUtils.consumeNextEndElement(parser)
Review Comment:
This change is not needed.
This may be a parsing exception. Parsing any further will likely result in
more exception. And `badRecordException ` will be lost.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]