This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 129d740 DRILL-7979: Self-Closing XML Tags Cause Schema Change
Exceptions (#2283)
129d740 is described below
commit 129d740c79bc49367d64de12cda8194bf60a324c
Author: Charles S. Givre <[email protected]>
AuthorDate: Mon Aug 2 23:34:28 2021 -0700
DRILL-7979: Self-Closing XML Tags Cause Schema Change Exceptions (#2283)
* Initial commit
* WIP
* Unit test working but attribute not popping off stack
* Everything working
* Removed Logback.xml
* Fixed Unit Test
* Fixed corrupt file
* Addressed Review Comments
---
.../org/apache/drill/exec/store/xml/XMLReader.java | 60 +++++++++++++++++++---
.../org/apache/drill/exec/store/xml/XMLUtils.java | 21 ++++++--
.../apache/drill/exec/store/xml/TestXMLReader.java | 22 ++++++++
.../apache/drill/exec/store/xml/TestXMLUtils.java | 2 +-
.../src/test/resources/xml/schemaChange.xml | 32 ++++++++++++
5 files changed, 125 insertions(+), 12 deletions(-)
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
index 97ce737..dd66e1c 100644
---
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
@@ -72,9 +72,10 @@ public class XMLReader {
private InputStream fsStream;
private XMLEventReader reader;
private ImplicitColumns metadata;
+ private boolean isSelfClosingEvent;
/**
- * This field indicates the various states in which the reader operates. The
names should be self explanatory,
+ * This field indicates the various states in which the reader operates. The
names should be self-explanatory,
* but they are used as the reader iterates over the XML tags to know what
to do.
*/
private enum xmlState {
@@ -96,6 +97,7 @@ public class XMLReader {
nestedMapCollection = new HashMap<>();
this.dataLevel = dataLevel;
this.maxRecords = maxRecords;
+ isSelfClosingEvent = false;
}
public void open(RowSetLoader rootRowWriter, CustomErrorContext errorContext
) {
@@ -161,12 +163,18 @@ public class XMLReader {
try {
nextEvent = reader.nextEvent();
- // If the next event is whitespace, newlines, or other cruft that we
don't need
- // ignore and move to the next event
+ // If the next event is whitespace, newlines, or other cruft that we
don't need,
+ // ignore the event and move to the next event
if (XMLUtils.isEmptyWhiteSpace(nextEvent)) {
continue;
}
+ // Reset the self-closing tag flag.
+ isSelfClosingEvent = isSelfClosingEvent(currentEvent, nextEvent);
+ if (isSelfClosingEvent) {
+ logger.debug("Found self closing event!!");
+ }
+
// Capture the previous and current event
XMLEvent lastEvent = currentEvent;
currentEvent = nextEvent;
@@ -185,6 +193,35 @@ public class XMLReader {
}
/**
+ * One of the challenges with XML parsing are self-closing events. The
streaming XML parser
+ * treats self-closing events as two events: a start event and an ending
event. The issue is that
+ * the self-closing events can cause schema issues with Drill specifically,
if a self-closing event
+ * is detected prior to a non-self-closing event, and that populated event
contains a map or other nested data
+ * Drill will throw a schema change exception.
+ *
+ * Since Drill uses Java's streaming XML parser, unfortunately, it does not
provide a means of identifying
+ * self-closing tags. This function does that by comparing the event with
the previous event and looking for
+ * a condition where one event is a start and the other is an ending event.
Additionally, the column number and
+ * character offsets must be the same, indicating that the two events are
the same.
+ *
+ * @param e1 The first XMLEvent
+ * @param e2 The second XMLEvent
+ * @return True if the events represent a self-closing event, false if not.
+ */
+ private boolean isSelfClosingEvent(XMLEvent e1, XMLEvent e2) {
+ // If either event is null return false.
+ if (e1 == null || e2 == null) {
+ return false;
+ } else if (XMLUtils.hasAttributes(e1) || XMLUtils.hasAttributes(e2)) {
+ return false;
+ }
+
+ return (e1.getLocation().getCharacterOffset() ==
e2.getLocation().getCharacterOffset()) &&
+ (e1.getLocation().getColumnNumber() ==
e2.getLocation().getColumnNumber()) &&
+ e1.isStartElement() && e2.isEndElement();
+ }
+
+ /**
* This function processes an actual XMLEvent. There are three possibilities:
* 1. The event is a start event
* 2. The event contains text
@@ -233,14 +270,13 @@ public class XMLReader {
if (!rowStarted) {
currentTupleWriter = startRow(rootRowWriter);
} else {
- if (lastEvent!= null &&
+ if (lastEvent != null &&
lastEvent.getEventType() == XMLStreamConstants.START_ELEMENT) {
/*
* Check the flag in the next section. If the next element is a
character AND the flag is set,
* start a map. If not... ignore it all.
*/
changeState(xmlState.POSSIBLE_MAP);
-
rowWriterStack.push(currentTupleWriter);
}
@@ -292,6 +328,13 @@ public class XMLReader {
case XMLStreamConstants.END_ELEMENT:
currentNestingLevel--;
+ if (isSelfClosingEvent) {
+ logger.debug("Closing self-closing event {}. ", fieldName);
+ isSelfClosingEvent = false;
+ attributePrefix = XMLUtils.removeField(attributePrefix,fieldName);
+ break;
+ }
+
if (currentNestingLevel < dataLevel - 1) {
break;
} else if
(currentEvent.asEndElement().getName().toString().compareTo(rootDataFieldName)
== 0) {
@@ -316,8 +359,10 @@ public class XMLReader {
attributePrefix = XMLUtils.removeField(attributePrefix,fieldName);
- } else if (currentState != xmlState.ROW_ENDED){
- writeFieldData(fieldName, fieldValue, currentTupleWriter);
+ } else if (currentState != xmlState.ROW_ENDED) {
+ if ( !isSelfClosingEvent) {
+ writeFieldData(fieldName, fieldValue, currentTupleWriter);
+ }
// Clear out field name and value
attributePrefix = XMLUtils.removeField(attributePrefix, fieldName);
@@ -438,6 +483,7 @@ public class XMLReader {
// Add map to map collection for future use
nestedMapCollection.put(tempFieldName, new XMLMap(mapName,
rowWriter.tuple(index)));
}
+ logger.debug("Index: {}, Fieldname: {}", index, mapName);
return rowWriter.tuple(index);
}
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
index f11b483..dcf9c46 100644
---
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
@@ -82,12 +82,25 @@ public class XMLUtils {
}
int index = prefix.lastIndexOf(fieldName);
- if (index == 0) {
+ if (index <= 0) {
return "";
- } else if (index < 0) {
- return prefix;
+ } else {
+ return prefix.substring(0, index - 1);
}
+ }
- return prefix.substring(0, index-1);
+ /**
+ * Returns true if a given XMLEvent has attributes, false if not. Since only
+ * start elements can by definition have attributes, returns false if the
event
+ * is not a start element.
+ * @param event The XMLEvent in question
+ * @return True if the XMLEvent has attributes, false if not.
+ */
+ public static boolean hasAttributes(XMLEvent event) {
+ if (! event.isStartElement()) {
+ return false;
+ } else {
+ return event.asStartElement().getAttributes().hasNext();
+ }
}
}
diff --git
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
index e32a173..f766777 100644
---
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
+++
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
@@ -543,4 +543,26 @@ public class TestXMLReader extends ClusterTest {
.include("Limit", "maxRecords=2")
.match();
}
+
+ @Test
+ public void testMapError() throws Exception {
+ String sql = "SELECT * FROM cp.`xml/schemaChange.xml`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addMap("attributes")
+ .resumeSchema()
+ .addMap("parent")
+ .addNullable("link", MinorType.VARCHAR)
+ .addNullable("value", MinorType.VARCHAR)
+ .resumeSchema()
+ .build();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(mapArray(), mapArray(null, null))
+ .addRow(mapArray(),
strArray("https://dev57595.service-now.com/api/now/table/task/46eaa7c9a9fe198100bbe282da0d4b7d",
"46eaa7c9a9fe198100bbe282da0d4b7d"))
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
}
diff --git
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLUtils.java
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLUtils.java
index 06d70ee..5588577 100644
---
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLUtils.java
+++
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLUtils.java
@@ -35,7 +35,7 @@ public class TestXMLUtils {
// Test with missing field
String test3 = "field_1_field_2_field_3";
- assertEquals(XMLUtils.removeField(test3, "field_4"),
"field_1_field_2_field_3");
+ assertEquals(XMLUtils.removeField(test3, "field_4"), "");
// Test with empty string
String test4 = "";
diff --git a/contrib/format-xml/src/test/resources/xml/schemaChange.xml
b/contrib/format-xml/src/test/resources/xml/schemaChange.xml
new file mode 100644
index 0000000..40ee83d
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/schemaChange.xml
@@ -0,0 +1,32 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<response>
+ <result>
+ <parent/>
+ <reason/>
+ </result>
+ <result>
+ <parent>
+
<link>https://dev57595.service-now.com/api/now/table/task/46eaa7c9a9fe198100bbe282da0d4b7d</link>
+ <value>46eaa7c9a9fe198100bbe282da0d4b7d</value>
+ </parent>
+ <reason/>
+ </result>
+</response>
\ No newline at end of file