This is an automated email from the ASF dual-hosted git repository. cgivre pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push: new dda5bc4680 DRILL-8493: Drill Unable to Read XML Files with Namespaces (#2908) dda5bc4680 is described below commit dda5bc46808bcc361bd471584d1ee9f04bdc2db6 Author: Charles S. Givre <cgi...@apache.org> AuthorDate: Sat Apr 27 21:55:29 2024 -0400 DRILL-8493: Drill Unable to Read XML Files with Namespaces (#2908) --- .../org/apache/drill/exec/store/xml/XMLReader.java | 13 +++++-- .../apache/drill/exec/store/xml/TestXMLReader.java | 24 ++++++++++++ .../format-xml/src/test/resources/xml/sitemap.xml | 45 ++++++++++++++++++++++ 3 files changed, 78 insertions(+), 4 deletions(-) diff --git a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java index 1d85851946..d985cd69d8 100644 --- a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java +++ b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java @@ -82,6 +82,7 @@ public class XMLReader implements Closeable { private XMLEventReader reader; private ImplicitColumns metadata; private boolean isSelfClosingEvent; + private Iterator<Attribute> rootAttributeIterator; /** * This field indicates the various states in which the reader operates. The names should be self-explanatory, @@ -103,6 +104,11 @@ public class XMLReader implements Closeable { // This property prevents XXE attacks by disallowing DTD. inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false); + + // When reading some documents with XML Namespaces, Drill seems to ignore the rest of the + // document. Setting this parameter to false solves this issue. However, when we introduce + // XSD support, it will likely be necessary to make this a configurable parameter. + inputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); reader = inputFactory.createXMLEventReader(fsStream); fieldNameStack = new Stack<>(); rowWriterStack = new Stack<>(); @@ -340,7 +346,6 @@ public class XMLReader implements Closeable { // Get the field value fieldValue = currentEvent.asCharacters().getData().trim(); changeState(xmlState.GETTING_DATA); - changeState(xmlState.GETTING_DATA); break; case XMLStreamConstants.END_ELEMENT: @@ -367,11 +372,11 @@ public class XMLReader implements Closeable { } else if (currentState == xmlState.FIELD_ENDED && currentNestingLevel >= dataLevel) { // Case to end nested maps // Pop tupleWriter off stack - if (rowWriterStack.size() > 0) { + if (!rowWriterStack.isEmpty()) { currentTupleWriter = rowWriterStack.pop(); } // Pop field name - if (fieldNameStack.size() > 0) { + if (!fieldNameStack.isEmpty()) { fieldNameStack.pop(); } @@ -385,7 +390,7 @@ public class XMLReader implements Closeable { attributePrefix = XMLUtils.removeField(attributePrefix, fieldName); // Pop field name - if (fieldNameStack.size() > 0) { + if (!fieldNameStack.isEmpty()) { fieldNameStack.pop(); } fieldName = null; diff --git a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java index 1283337410..f6c924f0a7 100644 --- a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java +++ b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java @@ -117,6 +117,30 @@ public class TestXMLReader extends ClusterTest { new RowSetComparison(expected).verifyAndClearAll(results); } + @Test + public void testAttributesOnRootWithNamespace() throws Exception { + String sql = "SELECT * FROM table(cp.`xml/sitemap.xml` (type => 'xml', dataLevel => 2))"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + TupleMetadata expectedSchema = new SchemaBuilder() + .add("attributes", MinorType.MAP, DataMode.REQUIRED) + .addNullable("loc", MinorType.VARCHAR) + .addNullable("lastmod", MinorType.VARCHAR) + .addNullable("changefreq", MinorType.VARCHAR) + .addNullable("priority", MinorType.VARCHAR) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ1.xml", "2024-03-28T00:10:00.074Z", "monthly", "1.0") + .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ2.xml", "2023-06-20T23:44:00.215Z", "monthly", "1.0") + .addRow(mapArray(), "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ3.xml", "2023-07-03T14:32:01.529Z", "monthly", "1.0") + .build(); + + assertEquals(3, results.rowCount()); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test public void testXXE() throws Exception { String sql = "SELECT * FROM cp.`xml/bad.xml`"; diff --git a/contrib/format-xml/src/test/resources/xml/sitemap.xml b/contrib/format-xml/src/test/resources/xml/sitemap.xml new file mode 100644 index 0000000000..1225e6bb5d --- /dev/null +++ b/contrib/format-xml/src/test/resources/xml/sitemap.xml @@ -0,0 +1,45 @@ +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +--> +<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"> + <url> + <loc> +https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ1.xml +</loc> + <lastmod>2024-03-28T00:10:00.074Z</lastmod> + <changefreq>monthly</changefreq> + <priority>1.0</priority> + </url> + <url> + <loc> +https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ2.xml +</loc> + <lastmod>2023-06-20T23:44:00.215Z</lastmod> + <changefreq>monthly</changefreq> + <priority>1.0</priority> + </url> + <url> + <loc> +https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ3.xml +</loc> + <lastmod>2023-07-03T14:32:01.529Z</lastmod> + <changefreq>monthly</changefreq> + <priority>1.0</priority> + </url> +</urlset>