This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 129d740  DRILL-7979: Self-Closing XML Tags Cause Schema Change 
Exceptions (#2283)
129d740 is described below

commit 129d740c79bc49367d64de12cda8194bf60a324c
Author: Charles S. Givre <[email protected]>
AuthorDate: Mon Aug 2 23:34:28 2021 -0700

    DRILL-7979: Self-Closing XML Tags Cause Schema Change Exceptions (#2283)
    
    * Initial commit
    
    * WIP
    
    * Unit test working but attribute not popping off stack
    
    * Everything working
    
    * Removed Logback.xml
    
    * Fixed Unit Test
    
    * Fixed corrupt file
    
    * Addressed Review Comments
---
 .../org/apache/drill/exec/store/xml/XMLReader.java | 60 +++++++++++++++++++---
 .../org/apache/drill/exec/store/xml/XMLUtils.java  | 21 ++++++--
 .../apache/drill/exec/store/xml/TestXMLReader.java | 22 ++++++++
 .../apache/drill/exec/store/xml/TestXMLUtils.java  |  2 +-
 .../src/test/resources/xml/schemaChange.xml        | 32 ++++++++++++
 5 files changed, 125 insertions(+), 12 deletions(-)

diff --git 
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
 
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
index 97ce737..dd66e1c 100644
--- 
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
+++ 
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
@@ -72,9 +72,10 @@ public class XMLReader {
   private InputStream fsStream;
   private XMLEventReader reader;
   private ImplicitColumns metadata;
+  private boolean isSelfClosingEvent;
 
   /**
-   * This field indicates the various states in which the reader operates. The 
names should be self explanatory,
+   * This field indicates the various states in which the reader operates. The 
names should be self-explanatory,
    * but they are used as the reader iterates over the XML tags to know what 
to do.
    */
   private enum xmlState {
@@ -96,6 +97,7 @@ public class XMLReader {
     nestedMapCollection = new HashMap<>();
     this.dataLevel = dataLevel;
     this.maxRecords = maxRecords;
+    isSelfClosingEvent = false;
   }
 
   public void open(RowSetLoader rootRowWriter, CustomErrorContext errorContext 
) {
@@ -161,12 +163,18 @@ public class XMLReader {
       try {
         nextEvent = reader.nextEvent();
 
-        // If the next event is whitespace, newlines, or other cruft that we 
don't need
-        // ignore and move to the next event
+        // If the next event is whitespace, newlines, or other cruft that we 
don't need,
+        // ignore the event and move to the next event
         if (XMLUtils.isEmptyWhiteSpace(nextEvent)) {
           continue;
         }
 
+        // Reset the self-closing tag flag.
+        isSelfClosingEvent = isSelfClosingEvent(currentEvent, nextEvent);
+        if (isSelfClosingEvent) {
+          logger.debug("Found self closing event!!");
+        }
+
         // Capture the previous and current event
         XMLEvent lastEvent = currentEvent;
         currentEvent = nextEvent;
@@ -185,6 +193,35 @@ public class XMLReader {
   }
 
   /**
+   * One of the challenges with XML parsing are self-closing events. The 
streaming XML parser
+   * treats self-closing events as two events: a start event and an ending 
event.  The issue is that
+   * the self-closing events can cause schema issues with Drill specifically, 
if a self-closing event
+   * is detected prior to a non-self-closing event, and that populated event 
contains a map or other nested data
+   * Drill will throw a schema change exception.
+   *
+   * Since Drill uses Java's streaming XML parser, unfortunately, it does not 
provide a means of identifying
+   * self-closing tags.  This function does that by comparing the event with 
the previous event and looking for
+   * a condition where one event is a start and the other is an ending event.  
Additionally, the column number and
+   * character offsets must be the same, indicating that the two events are 
the same.
+   *
+   * @param e1 The first XMLEvent
+   * @param e2 The second XMLEvent
+   * @return True if the events represent a self-closing event, false if not.
+   */
+  private boolean isSelfClosingEvent(XMLEvent e1, XMLEvent e2) {
+    // If either event is null return false.
+    if (e1 == null || e2 == null) {
+      return false;
+    } else if (XMLUtils.hasAttributes(e1) || XMLUtils.hasAttributes(e2)) {
+      return false;
+    }
+
+    return (e1.getLocation().getCharacterOffset() == 
e2.getLocation().getCharacterOffset()) &&
+      (e1.getLocation().getColumnNumber() == 
e2.getLocation().getColumnNumber()) &&
+      e1.isStartElement() && e2.isEndElement();
+  }
+
+  /**
    * This function processes an actual XMLEvent. There are three possibilities:
    * 1.  The event is a start event
    * 2.  The event contains text
@@ -233,14 +270,13 @@ public class XMLReader {
         if (!rowStarted) {
           currentTupleWriter = startRow(rootRowWriter);
         } else {
-          if (lastEvent!= null &&
+          if (lastEvent != null &&
             lastEvent.getEventType() == XMLStreamConstants.START_ELEMENT) {
             /*
              * Check the flag in the next section.  If the next element is a 
character AND the flag is set,
              * start a map.  If not... ignore it all.
              */
             changeState(xmlState.POSSIBLE_MAP);
-
             rowWriterStack.push(currentTupleWriter);
           }
 
@@ -292,6 +328,13 @@ public class XMLReader {
       case XMLStreamConstants.END_ELEMENT:
         currentNestingLevel--;
 
+        if (isSelfClosingEvent) {
+          logger.debug("Closing self-closing event {}. ", fieldName);
+          isSelfClosingEvent = false;
+          attributePrefix = XMLUtils.removeField(attributePrefix,fieldName);
+          break;
+        }
+
         if (currentNestingLevel < dataLevel - 1) {
           break;
         } else if 
(currentEvent.asEndElement().getName().toString().compareTo(rootDataFieldName) 
== 0) {
@@ -316,8 +359,10 @@ public class XMLReader {
 
           attributePrefix = XMLUtils.removeField(attributePrefix,fieldName);
 
-        } else if (currentState != xmlState.ROW_ENDED){
-          writeFieldData(fieldName, fieldValue, currentTupleWriter);
+        } else if (currentState != xmlState.ROW_ENDED) {
+          if ( !isSelfClosingEvent) {
+            writeFieldData(fieldName, fieldValue, currentTupleWriter);
+          }
           // Clear out field name and value
           attributePrefix = XMLUtils.removeField(attributePrefix, fieldName);
 
@@ -438,6 +483,7 @@ public class XMLReader {
       // Add map to map collection for future use
       nestedMapCollection.put(tempFieldName, new XMLMap(mapName, 
rowWriter.tuple(index)));
     }
+    logger.debug("Index: {}, Fieldname: {}", index, mapName);
     return rowWriter.tuple(index);
   }
 
diff --git 
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
 
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
index f11b483..dcf9c46 100644
--- 
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
+++ 
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
@@ -82,12 +82,25 @@ public class XMLUtils {
     }
 
     int index = prefix.lastIndexOf(fieldName);
-    if (index == 0) {
+    if (index <= 0) {
       return "";
-    } else if (index < 0) {
-      return prefix;
+    } else {
+      return prefix.substring(0, index - 1);
     }
+  }
 
-    return prefix.substring(0, index-1);
+  /**
+   * Returns true if a given XMLEvent has attributes, false if not. Since only
+   * start elements can by definition have attributes, returns false if the 
event
+   * is not a start element.
+   * @param event The XMLEvent in question
+   * @return True if the XMLEvent has attributes, false if not.
+   */
+  public static boolean hasAttributes(XMLEvent event) {
+    if (! event.isStartElement()) {
+      return false;
+    } else {
+      return event.asStartElement().getAttributes().hasNext();
+    }
   }
 }
diff --git 
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
 
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
index e32a173..f766777 100644
--- 
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
+++ 
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
@@ -543,4 +543,26 @@ public class TestXMLReader extends ClusterTest {
       .include("Limit", "maxRecords=2")
       .match();
   }
+
+  @Test
+  public void testMapError() throws Exception {
+    String sql = "SELECT * FROM cp.`xml/schemaChange.xml`";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addMap("attributes")
+      .resumeSchema()
+      .addMap("parent")
+        .addNullable("link", MinorType.VARCHAR)
+        .addNullable("value", MinorType.VARCHAR)
+      .resumeSchema()
+      .build();
+
+    RowSet expected = client.rowSetBuilder(expectedSchema)
+      .addRow(mapArray(), mapArray(null, null))
+      .addRow(mapArray(), 
strArray("https://dev57595.service-now.com/api/now/table/task/46eaa7c9a9fe198100bbe282da0d4b7d";,
 "46eaa7c9a9fe198100bbe282da0d4b7d"))
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
 }
diff --git 
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLUtils.java
 
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLUtils.java
index 06d70ee..5588577 100644
--- 
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLUtils.java
+++ 
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLUtils.java
@@ -35,7 +35,7 @@ public class TestXMLUtils {
 
     // Test with missing field
     String test3 = "field_1_field_2_field_3";
-    assertEquals(XMLUtils.removeField(test3, "field_4"), 
"field_1_field_2_field_3");
+    assertEquals(XMLUtils.removeField(test3, "field_4"), "");
 
     // Test with empty string
     String test4 = "";
diff --git a/contrib/format-xml/src/test/resources/xml/schemaChange.xml 
b/contrib/format-xml/src/test/resources/xml/schemaChange.xml
new file mode 100644
index 0000000..40ee83d
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/schemaChange.xml
@@ -0,0 +1,32 @@
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+-->
+<response>
+  <result>
+    <parent/>
+    <reason/>
+  </result>
+  <result>
+    <parent>
+      
<link>https://dev57595.service-now.com/api/now/table/task/46eaa7c9a9fe198100bbe282da0d4b7d</link>
+      <value>46eaa7c9a9fe198100bbe282da0d4b7d</value>
+    </parent>
+    <reason/>
+  </result>
+</response>
\ No newline at end of file

Reply via email to