This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new dda5bc4680 DRILL-8493: Drill Unable to Read XML Files with Namespaces (#2908)
dda5bc4680 is described below

commit dda5bc46808bcc361bd471584d1ee9f04bdc2db6
Author: Charles S. Givre <cgi...@apache.org>
AuthorDate: Sat Apr 27 21:55:29 2024 -0400

    DRILL-8493: Drill Unable to Read XML Files with Namespaces (#2908)
---
 .../org/apache/drill/exec/store/xml/XMLReader.java | 13 +++++--
 .../apache/drill/exec/store/xml/TestXMLReader.java | 24 ++++++++++++
 .../format-xml/src/test/resources/xml/sitemap.xml  | 45 ++++++++++++++++++++++
 3 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
index 1d85851946..d985cd69d8 100644
--- a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
+++ b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
@@ -82,6 +82,7 @@ public class XMLReader implements Closeable {
   private XMLEventReader reader;
   private ImplicitColumns metadata;
   private boolean isSelfClosingEvent;
+  private Iterator<Attribute> rootAttributeIterator;
 
   /**
    * This field indicates the various states in which the reader operates. The names should be self-explanatory,
@@ -103,6 +104,11 @@ public class XMLReader implements Closeable {
 
     // This property prevents XXE attacks by disallowing DTD.
     inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+
+    // When reading some documents with XML Namespaces, Drill seems to ignore the rest of the
+    // document. Setting this parameter to false solves this issue.  However, when we introduce
+    // XSD support, it will likely be necessary to make this a configurable parameter.
+    inputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
     reader = inputFactory.createXMLEventReader(fsStream);
     fieldNameStack = new Stack<>();
     rowWriterStack = new Stack<>();
@@ -340,7 +346,6 @@ public class XMLReader implements Closeable {
         // Get the field value
         fieldValue = currentEvent.asCharacters().getData().trim();
         changeState(xmlState.GETTING_DATA);
-        changeState(xmlState.GETTING_DATA);
         break;
 
       case XMLStreamConstants.END_ELEMENT:
@@ -367,11 +372,11 @@ public class XMLReader implements Closeable {
         } else if (currentState == xmlState.FIELD_ENDED && currentNestingLevel >= dataLevel) {
           // Case to end nested maps
           // Pop tupleWriter off stack
-          if (rowWriterStack.size() > 0) {
+          if (!rowWriterStack.isEmpty()) {
             currentTupleWriter = rowWriterStack.pop();
           }
           // Pop field name
-          if (fieldNameStack.size() > 0) {
+          if (!fieldNameStack.isEmpty()) {
             fieldNameStack.pop();
           }
 
@@ -385,7 +390,7 @@ public class XMLReader implements Closeable {
           attributePrefix = XMLUtils.removeField(attributePrefix, fieldName);
 
           // Pop field name
-          if (fieldNameStack.size() > 0) {
+          if (!fieldNameStack.isEmpty()) {
             fieldNameStack.pop();
           }
           fieldName = null;
diff --git a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
index 1283337410..f6c924f0a7 100644
--- a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
+++ b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
@@ -117,6 +117,30 @@ public class TestXMLReader extends ClusterTest {
     new RowSetComparison(expected).verifyAndClearAll(results);
   }
 
+  @Test
+  public void testAttributesOnRootWithNamespace() throws Exception {
+    String sql = "SELECT * FROM table(cp.`xml/sitemap.xml` (type => 'xml', dataLevel => 2))";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+        .add("attributes", MinorType.MAP, DataMode.REQUIRED)
+        .addNullable("loc", MinorType.VARCHAR)
+        .addNullable("lastmod", MinorType.VARCHAR)
+        .addNullable("changefreq", MinorType.VARCHAR)
+        .addNullable("priority", MinorType.VARCHAR)
+        .build();
+
+    RowSet expected = client.rowSetBuilder(expectedSchema)
+        .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ1.xml", "2024-03-28T00:10:00.074Z", "monthly", "1.0")
+        .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ2.xml", "2023-06-20T23:44:00.215Z", "monthly", "1.0")
+        .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ3.xml", "2023-07-03T14:32:01.529Z", "monthly", "1.0")
+        .build();
+
+    assertEquals(3, results.rowCount());
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+
   @Test
   public void testXXE() throws Exception {
     String sql = "SELECT * FROM cp.`xml/bad.xml`";
diff --git a/contrib/format-xml/src/test/resources/xml/sitemap.xml b/contrib/format-xml/src/test/resources/xml/sitemap.xml
new file mode 100644
index 0000000000..1225e6bb5d
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/sitemap.xml
@@ -0,0 +1,45 @@
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+-->
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+  <url>
+    <loc>
+https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ1.xml
+</loc>
+    <lastmod>2024-03-28T00:10:00.074Z</lastmod>
+    <changefreq>monthly</changefreq>
+    <priority>1.0</priority>
+  </url>
+  <url>
+    <loc>
+https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ2.xml
+</loc>
+    <lastmod>2023-06-20T23:44:00.215Z</lastmod>
+    <changefreq>monthly</changefreq>
+    <priority>1.0</priority>
+  </url>
+  <url>
+    <loc>
+https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ3.xml
+</loc>
+    <lastmod>2023-07-03T14:32:01.529Z</lastmod>
+    <changefreq>monthly</changefreq>
+    <priority>1.0</priority>
+  </url>
+</urlset>

Reply via email to