sandip-db commented on code in PR #44571:
URL: https://github.com/apache/spark/pull/44571#discussion_r1442428458


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -208,56 +202,36 @@ class StaxXmlParser(
     (parser.peek, dataType) match {
       case (_: StartElement, dt: DataType) => convertComplicatedType(dt, 
attributes)
       case (_: EndElement, _: StringType) =>
+        StaxXmlParserUtils.consumeNextEndElement(parser)

Review Comment:
   ```suggestion
           parser.next
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -208,56 +202,36 @@ class StaxXmlParser(
     (parser.peek, dataType) match {
       case (_: StartElement, dt: DataType) => convertComplicatedType(dt, 
attributes)
       case (_: EndElement, _: StringType) =>
+        StaxXmlParserUtils.consumeNextEndElement(parser)
         // Empty. It's null if "" is the null value
         if (options.nullValue == "") {
           null
         } else {
           UTF8String.fromString("")
         }
-      case (_: EndElement, _: DataType) => null
+      case (_: EndElement, _: DataType) =>
+        StaxXmlParserUtils.consumeNextEndElement(parser)
+        null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
         parser.next
-        convertTo(c.getData, st)
-      case (c: Characters, st: StructType) =>
-        parser.next
-        parser.peek match {
-          case _: EndElement =>
-            // It couldn't be an array of value tags
-            // as the opening tag is immediately followed by a closing tag.
-            if (c.isWhiteSpace) {
-              return null
-            }
-            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
-            indexOpt match {
-              case Some(index) =>
-                convertTo(c.getData, st.fields(index).dataType)
-              case None => null
-            }
-          case _ =>
-            val row = convertObject(parser, st)
-            if (!c.isWhiteSpace) {
-              addOrUpdate(row.toSeq(st).toArray, st, options.valueTag, 
c.getData, addToTail = false)
-            } else {
-              row
-            }
-        }
+        val value = convertTo(c.getData, st)
+        StaxXmlParserUtils.consumeNextEndElement(parser)

Review Comment:
   The check in this function will fail if the next event is Comment, Cdata, 
etc.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -423,19 +396,22 @@ class StaxXmlParser(
                 }
               } else {
                 StaxXmlParserUtils.skipChildren(parser)
+                StaxXmlParserUtils.consumeNextEndElement(parser)

Review Comment:
   Why not skip EndElement in skipChildren itself?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala:
##########
@@ -151,6 +151,7 @@ object StaxXmlParserUtils {
         indent > 0
       case _ => true
     })
+    consumeNextEndElement(parser)

Review Comment:
   ```suggestion
           if (indent <= 0) parser.next
           indent > 0
         case _ => true
       })
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -208,56 +202,36 @@ class StaxXmlParser(
     (parser.peek, dataType) match {
       case (_: StartElement, dt: DataType) => convertComplicatedType(dt, 
attributes)
       case (_: EndElement, _: StringType) =>
+        StaxXmlParserUtils.consumeNextEndElement(parser)
         // Empty. It's null if "" is the null value
         if (options.nullValue == "") {
           null
         } else {
           UTF8String.fromString("")
         }
-      case (_: EndElement, _: DataType) => null
+      case (_: EndElement, _: DataType) =>
+        StaxXmlParserUtils.consumeNextEndElement(parser)

Review Comment:
   ```suggestion
           parser.next
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -208,56 +202,36 @@ class StaxXmlParser(
     (parser.peek, dataType) match {
       case (_: StartElement, dt: DataType) => convertComplicatedType(dt, 
attributes)
       case (_: EndElement, _: StringType) =>
+        StaxXmlParserUtils.consumeNextEndElement(parser)
         // Empty. It's null if "" is the null value
         if (options.nullValue == "") {
           null
         } else {
           UTF8String.fromString("")
         }
-      case (_: EndElement, _: DataType) => null
+      case (_: EndElement, _: DataType) =>
+        StaxXmlParserUtils.consumeNextEndElement(parser)
+        null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
         parser.next
-        convertTo(c.getData, st)
-      case (c: Characters, st: StructType) =>
-        parser.next
-        parser.peek match {
-          case _: EndElement =>
-            // It couldn't be an array of value tags
-            // as the opening tag is immediately followed by a closing tag.
-            if (c.isWhiteSpace) {
-              return null
-            }
-            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
-            indexOpt match {
-              case Some(index) =>
-                convertTo(c.getData, st.fields(index).dataType)
-              case None => null
-            }
-          case _ =>
-            val row = convertObject(parser, st)
-            if (!c.isWhiteSpace) {
-              addOrUpdate(row.toSeq(st).toArray, st, options.valueTag, 
c.getData, addToTail = false)
-            } else {
-              row
-            }
-        }
+        val value = convertTo(c.getData, st)
+        StaxXmlParserUtils.consumeNextEndElement(parser)
+        value
+      case (_: Characters, st: StructType) =>
+        convertObject(parser, st)
       case (_: Characters, _: StringType) =>
         convertTo(StaxXmlParserUtils.currentStructureAsString(parser), 
StringType)
       case (c: Characters, _: DataType) if c.isWhiteSpace =>
         // When `Characters` is found, we need to look further to decide
         // if this is really data or space between other elements.
-        val data = c.getData
         parser.next
-        parser.peek match {
-          case _: StartElement => convertComplicatedType(dataType, attributes)
-          case _: EndElement if data.isEmpty => null
-          case _: EndElement => convertTo(data, dataType)
-          case _ => convertField(parser, dataType, attributes)
-        }
+        convertField(parser, dataType, attributes)
       case (c: Characters, dt: DataType) =>
+        val value = convertTo(c.getData, dt)
         parser.next
-        convertTo(c.getData, dt)
+        StaxXmlParserUtils.consumeNextEndElement(parser)

Review Comment:
   The check in this function will fail if the next event is Comment, Cdata, 
etc.
   `<ROW><foo attr="a">1<!--This is a comment--></foo></ROW>`



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -423,19 +396,22 @@ class StaxXmlParser(
                 }
               } else {
                 StaxXmlParserUtils.skipChildren(parser)
+                StaxXmlParserUtils.consumeNextEndElement(parser)
               }
           }
         } catch {
           case e: SparkUpgradeException => throw e
           case NonFatal(e) =>
             badRecordException = badRecordException.orElse(Some(e))
+            StaxXmlParserUtils.skipChildren(parser)
+            StaxXmlParserUtils.consumeNextEndElement(parser)

Review Comment:
   This change is not needed. 
   This may be a parsing exception. Parsing any further will likely result in 
more exception. And `badRecordException ` will be lost.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to