This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 8f892b3 DRILL-7823 - Add XML Format Plugin
8f892b3 is described below
commit 8f892b3c9b04e5e0ff1973681ff862da857d22ef
Author: Charles Givre <[email protected]>
AuthorDate: Mon Dec 21 13:33:11 2020 -0500
DRILL-7823 - Add XML Format Plugin
---
contrib/format-xml/README.md | 136 +++++++
contrib/format-xml/pom.xml | 86 +++++
.../drill/exec/store/xml/XMLBatchReader.java | 100 +++++
.../drill/exec/store/xml/XMLFormatConfig.java | 79 ++++
.../drill/exec/store/xml/XMLFormatPlugin.java | 93 +++++
.../org/apache/drill/exec/store/xml/XMLMap.java | 63 +++
.../org/apache/drill/exec/store/xml/XMLReader.java | 416 ++++++++++++++++++++
.../org/apache/drill/exec/store/xml/XMLUtils.java | 92 +++++
.../main/resources/bootstrap-format-plugins.json | 26 ++
.../src/main/resources/drill-module.conf | 25 ++
.../apache/drill/exec/store/xml/TestXMLReader.java | 428 +++++++++++++++++++++
.../src/test/resources/xml/attributes.xml | 42 ++
.../src/test/resources/xml/deep-nested.xml | 60 +++
.../src/test/resources/xml/deep-nested2.xml | 60 +++
.../format-xml/src/test/resources/xml/nested.xml | 63 +++
.../test/resources/xml/really-simple-nested.xml | 39 ++
.../format-xml/src/test/resources/xml/simple.xml | 42 ++
.../src/test/resources/xml/simple_schema.xsd | 43 +++
.../src/test/resources/xml/very-nested.xml | 38 ++
.../native/client/src/protobuf/UserBitShared.pb.cc | 15 +-
.../native/client/src/protobuf/UserBitShared.pb.h | 5 +-
contrib/pom.xml | 1 +
distribution/pom.xml | 5 +
distribution/src/assemble/component.xml | 1 +
.../org/apache/drill/exec/proto/UserBitShared.java | 21 +-
protocol/src/main/protobuf/UserBitShared.proto | 1 +
26 files changed, 1965 insertions(+), 15 deletions(-)
diff --git a/contrib/format-xml/README.md b/contrib/format-xml/README.md
new file mode 100644
index 0000000..dc245a3
--- /dev/null
+++ b/contrib/format-xml/README.md
@@ -0,0 +1,136 @@
+# XML Format Reader
+This plugin enables Drill to read XML files without defining any kind of
schema.
+
+## Configuration
+Aside from the file extension, there is one configuration option:
+
+* `dataLevel`: XML data often contains a considerable amount of nesting which
is not necesarily useful for data analysis. This parameter allows you to set
the nesting level
+ where the data actually starts. The levels start at `1`.
+
+The default configuration is shown below:
+
+```json
+"xml": {
+ "type": "xml",
+ "extensions": [
+ "xml"
+ ],
+ "dataLevel": 2
+}
+```
+
+## Data Types
+All fields are read as strings. Nested fields are read as maps. Future
functionality could include support for lists.
+
+### Attributes
+XML events can have attributes which can also be useful.
+```xml
+<book>
+ <author>O.-J. Dahl</author>
+ <title binding="hardcover" subcategory="non-fiction">Structured
Programming</title>
+ <category>PROGRAMMING</category>
+ <year>1972</year>
+</book>
+```
+
+In the example above, the `title` field contains two attributes, the `binding`
and `subcategory`. In order to access these fields, Drill creates a map called
`attributes` and
+adds an entry for each attribute with the field name and then the attribute
name. Every XML file will have a field called `atttributes` regardless of
whether the data actually
+has attributes or not.
+
+```xml
+<books>
+ <book>
+ <author>Mark Twain</author>
+ <title>The Adventures of Tom Sawyer</title>
+ <category>FICTION</category>
+ <year>1876</year>
+ </book>
+ <book>
+ <authors>
+ <author>Niklaus Wirth</author>
+ <author>Somebody else</author>
+ </authors>
+ <title binding="paperback">The Programming Language Pascal</title>
+ <category >PASCAL</category>
+ <year>1971</year>
+ </book>
+ <book>
+ <author>O.-J. Dahl</author>
+ <title binding="hardcover" subcategory="non-fiction">Structured
Programming</title>
+ <category>PROGRAMMING</category>
+ <year>1972</year>
+ </book>
+ </books>
+```
+If you queried this data in Drill you'd get the table below:
+
+```sql
+SELECT *
+FROM <path>.`attributes.xml`
+```
+
+```
+apache drill> select * from dfs.test.`attributes.xml`;
++-----------------------------------------------------------------+------------+---------------------------------+-------------+------+-----------------------------------------+
+| attributes | author
| title | category | year |
authors |
++-----------------------------------------------------------------+------------+---------------------------------+-------------+------+-----------------------------------------+
+| {} | Mark Twain
| The Adventures of Tom Sawyer | FICTION | 1876 | {}
|
+| {"title_binding":"paperback"} | null
| The Programming Language Pascal | PASCAL | 1971 | {"author":"Niklaus
WirthSomebody else"} |
+| {"title_binding":"hardcover","title_subcategory":"non-fiction"} | O.-J. Dahl
| Structured Programming | PROGRAMMING | 1972 | {}
|
++-----------------------------------------------------------------+------------+---------------------------------+-------------+------+-----------------------------------------+
+```
+
+
+
+## Limitations: Schema Ambiguity
+XML is a challenging format to process as the structure does not give any
hints about the schema. For example, a JSON file might have the following
record:
+
+```json
+"record" : {
+ "intField:" : 1,
+ "listField" : [1, 2],
+ "otherField" : {
+ "nestedField1" : "foo",
+ "nestedField2" : "bar"
+ }
+}
+```
+
+From this data, it is clear that `listField` is a `list` and `otherField` is a
map. This same data could be represented in XML as follows:
+
+```xml
+<record>
+ <intField>1</intField>
+ <listField>
+ <value>1</value>
+ <value>2</value>
+ </listField>
+ <otherField>
+ <nestedField1>foo</nestedField1>
+ <nestedField2>bar</nestedField2>
+ </otherField>
+</record>
+```
+This is no problem to parse this data. But consider what would happen if we
encountered the following first:
+```xml
+<record>
+ <intField>1</intField>
+ <listField>
+ <value>2</value>
+ </listField>
+ <otherField>
+ <nestedField1>foo</nestedField1>
+ <nestedField2>bar</nestedField2>
+ </otherField>
+</record>
+```
+In this example, there is no way for Drill to know whether `listField` is a
`list` or a `map` because it only has one entry.
+
+## Future Functionality
+
+* **Build schema from XSD file or link**: One of the major challenges of this
reader is having to infer the schema of the data. XML files do provide a schema
although this is not
+ required. In the future, if there is interest, we can extend this reader to
use an XSD file to build the schema which will be used to parse the actual XML
file.
+
+* **Infer Date Fields**: It may be possible to add the ability to infer data
fields.
+
+* **List Support**: Future functionality may include the ability to infer
lists from data structures.
\ No newline at end of file
diff --git a/contrib/format-xml/pom.xml b/contrib/format-xml/pom.xml
new file mode 100644
index 0000000..a000f8e
--- /dev/null
+++ b/contrib/format-xml/pom.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0"?>
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>drill-contrib-parent</artifactId>
+ <groupId>org.apache.drill.contrib</groupId>
+ <version>1.19.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>drill-format-xml</artifactId>
+ <name>contrib/format-xml</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.drill.exec</groupId>
+ <artifactId>drill-java-exec</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <!-- Test dependencies -->
+ <dependency>
+ <groupId>org.apache.drill.exec</groupId>
+ <artifactId>drill-java-exec</artifactId>
+ <classifier>tests</classifier>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.drill</groupId>
+ <artifactId>drill-common</artifactId>
+ <classifier>tests</classifier>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>copy-java-sources</id>
+ <phase>process-sources</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+
<outputDirectory>${basedir}/target/classes/org/apache/drill/exec/store/xml
+ </outputDirectory>
+ <resources>
+ <resource>
+
<directory>src/main/java/org/apache/drill/exec/store/xml</directory>
+ <filtering>true</filtering>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
\ No newline at end of file
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLBatchReader.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLBatchReader.java
new file mode 100644
index 0000000..83f549f
--- /dev/null
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLBatchReader.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml;
+
+import org.apache.drill.common.exceptions.CustomErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
+import
org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
+import org.apache.hadoop.mapred.FileSplit;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.InputStream;
+
+
+public class XMLBatchReader implements ManagedReader<FileSchemaNegotiator> {
+
+ private static final Logger logger =
LoggerFactory.getLogger(XMLBatchReader.class);
+
+ private FileSplit split;
+ private RowSetLoader rootRowWriter;
+ private CustomErrorContext errorContext;
+
+ private XMLReader reader;
+ private final int maxRecords;
+ private final int dataLevel;
+
+
+ static class XMLReaderConfig {
+ final XMLFormatPlugin plugin;
+ final int dataLevel;
+
+ XMLReaderConfig(XMLFormatPlugin plugin) {
+ this.plugin = plugin;
+ dataLevel = plugin.getConfig().dataLevel;
+ }
+ }
+
+ public XMLBatchReader(XMLReaderConfig readerConfig, EasySubScan scan) {
+ this.maxRecords = scan.getMaxRecords();
+ this.dataLevel = readerConfig.dataLevel;
+ }
+
+ @Override
+ public boolean open(FileSchemaNegotiator negotiator) {
+ split = negotiator.split();
+ ResultSetLoader loader = negotiator.build();
+ errorContext = negotiator.parentErrorContext();
+ rootRowWriter = loader.writer();
+
+ openFile(negotiator);
+ return true;
+ }
+
+ @Override
+ public boolean next() {
+ return reader.next();
+ }
+
+ @Override
+ public void close() {
+ reader.close();
+ }
+
+ private void openFile(FileScanFramework.FileSchemaNegotiator negotiator) {
+ try {
+ InputStream fsStream =
negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
+ reader = new XMLReader(fsStream, dataLevel, maxRecords);
+ reader.open(rootRowWriter, errorContext);
+ } catch (Exception e) {
+ throw UserException
+ .dataReadError(e)
+ .message("Failed to open open input file: {}",
split.getPath().toString())
+ .addContext(errorContext)
+ .addContext(e.getMessage())
+ .build(logger);
+ }
+ }
+}
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatConfig.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatConfig.java
new file mode 100644
index 0000000..0babf20
--- /dev/null
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatConfig.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+@JsonTypeName(XMLFormatPlugin.DEFAULT_NAME)
+@JsonInclude(JsonInclude.Include.NON_DEFAULT)
+public class XMLFormatConfig implements FormatPluginConfig {
+
+ public final List<String> extensions;
+ public final int dataLevel;
+
+ public XMLFormatConfig(@JsonProperty("extensions") List<String> extensions,
+ @JsonProperty("dataLevel") int dataLevel) {
+ this.extensions = extensions == null ? Collections.singletonList("xml") :
ImmutableList.copyOf(extensions);
+ this.dataLevel = Math.max(dataLevel, 1);
+ }
+
+ @JsonInclude(JsonInclude.Include.NON_DEFAULT)
+ public List<String> getExtensions() {
+ return extensions;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(extensions, dataLevel);
+ }
+
+ public XMLBatchReader.XMLReaderConfig getReaderConfig(XMLFormatPlugin
plugin) {
+ return new XMLBatchReader.XMLReaderConfig(plugin);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null || getClass() != obj.getClass()) {
+ return false;
+ }
+ XMLFormatConfig other = (XMLFormatConfig) obj;
+ return Objects.equals(extensions, other.extensions)
+ && Objects.equals(dataLevel, other.dataLevel);
+ }
+
+ @Override
+ public String toString() {
+ return new PlanStringBuilder(this)
+ .field("extensions", extensions)
+ .field("dataLevel", dataLevel)
+ .toString();
+ }
+}
\ No newline at end of file
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatPlugin.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatPlugin.java
new file mode 100644
index 0000000..7cf3932
--- /dev/null
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatPlugin.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml;
+
+import org.apache.drill.common.logical.StoragePluginConfig;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
+import
org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.proto.UserBitShared.CoreOperatorType;
+import org.apache.drill.exec.server.DrillbitContext;
+import org.apache.drill.exec.server.options.OptionManager;
+import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+
+
+public class XMLFormatPlugin extends EasyFormatPlugin<XMLFormatConfig> {
+
+ public static final String DEFAULT_NAME = "xml";
+
+ public static class XMLReaderFactory extends
FileScanFramework.FileReaderFactory {
+ private final XMLBatchReader.XMLReaderConfig readerConfig;
+ private final EasySubScan scan;
+
+ public XMLReaderFactory(XMLBatchReader.XMLReaderConfig config, EasySubScan
scan) {
+ this.readerConfig = config;
+ this.scan = scan;
+ }
+
+ @Override
+ public ManagedReader<? extends FileScanFramework.FileSchemaNegotiator>
newReader() {
+ return new XMLBatchReader(readerConfig, scan);
+ }
+ }
+
+ public XMLFormatPlugin(String name,
+ DrillbitContext context,
+ Configuration fsConf,
+ StoragePluginConfig storageConfig,
+ XMLFormatConfig formatConfig) {
+ super(name, easyConfig(fsConf, formatConfig), context, storageConfig,
formatConfig);
+ }
+
+ private static EasyFormatConfig easyConfig(Configuration fsConf,
XMLFormatConfig pluginConfig) {
+ EasyFormatConfig config = new EasyFormatConfig();
+ config.readable = true;
+ config.writable = false;
+ config.blockSplittable = false;
+ config.compressible = true;
+ config.supportsProjectPushdown = true;
+ config.extensions = Lists.newArrayList(pluginConfig.getExtensions());
+ config.fsConf = fsConf;
+ config.defaultName = DEFAULT_NAME;
+ config.readerOperatorType = CoreOperatorType.XML_SUB_SCAN_VALUE;
+ config.useEnhancedScan = true;
+ config.supportsLimitPushdown = true;
+ return config;
+ }
+
+ @Override
+ public ManagedReader<? extends FileScanFramework.FileSchemaNegotiator>
newBatchReader(
+ EasySubScan scan, OptionManager options) {
+ return new XMLBatchReader(formatConfig.getReaderConfig(this), scan);
+ }
+
+ @Override
+ protected FileScanFramework.FileScanBuilder frameworkBuilder(OptionManager
options, EasySubScan scan) {
+ FileScanBuilder builder = new FileScanBuilder();
+ builder.setReaderFactory(new XMLReaderFactory(new
XMLBatchReader.XMLReaderConfig(this), scan));
+ initScanBuilder(builder, scan);
+ builder.nullType(Types.optional(MinorType.VARCHAR));
+ return builder;
+ }
+}
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLMap.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLMap.java
new file mode 100644
index 0000000..557762c
--- /dev/null
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLMap.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml;
+
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.exec.vector.accessor.TupleWriter;
+
+import java.util.Objects;
+
+public class XMLMap {
+
+ private final String mapName;
+ private final TupleWriter mapWriter;
+
+ public XMLMap (String mapName, TupleWriter mapWriter) {
+ this.mapName = mapName;
+ this.mapWriter = mapWriter;
+ }
+
+ public TupleWriter getMapWriter() {
+ return mapWriter;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null || getClass() != obj.getClass()) {
+ return false;
+ }
+ XMLMap other = (XMLMap) obj;
+ return Objects.equals(mapName, other.mapName);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(mapName);
+ }
+
+ @Override
+ public String toString() {
+ return new PlanStringBuilder(this)
+ .field("Map Name", mapName)
+ .toString();
+ }
+}
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
new file mode 100644
index 0000000..7665e6a
--- /dev/null
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
@@ -0,0 +1,416 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml;
+
+import org.apache.drill.common.AutoCloseables;
+import org.apache.drill.common.exceptions.CustomErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.record.metadata.MetadataUtils;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.vector.accessor.ScalarWriter;
+import org.apache.drill.exec.vector.accessor.TupleWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.events.Attribute;
+import javax.xml.stream.events.StartElement;
+import javax.xml.stream.events.XMLEvent;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Stack;
+
+public class XMLReader {
+ private static final Logger logger =
LoggerFactory.getLogger(XMLReader.class);
+ private static final String ATTRIBUTE_MAP_NAME = "attributes";
+
+ private final Stack<String> fieldNameStack;
+ private final Stack<TupleWriter> rowWriterStack;
+ private final int dataLevel;
+ private final int maxRecords;
+ private final Map<String, XMLMap> nestedMapCollection;
+
+ private TupleWriter attributeWriter;
+ private CustomErrorContext errorContext;
+ private RowSetLoader rootRowWriter;
+ private int currentNestingLevel;
+ private XMLEvent currentEvent;
+ private String rootDataFieldName;
+ private String fieldName;
+ private xmlState currentState;
+ private TupleWriter currentTupleWriter;
+ private boolean rowStarted;
+ private String attributePrefix;
+ private String fieldValue;
+ private InputStream fsStream;
+ private XMLEventReader reader;
+
+ /**
+ * This field indicates the various states in which the reader operates. The
names should be self explanatory,
+ * but they are used as the reader iterates over the XML tags to know what
to do.
+ */
+ private enum xmlState {
+ ROW_STARTED,
+ POSSIBLE_MAP,
+ NESTED_MAP_STARTED,
+ GETTING_DATA,
+ WRITING_DATA,
+ FIELD_ENDED,
+ ROW_ENDED
+ }
+
+ public XMLReader(InputStream fsStream, int dataLevel, int maxRecords) throws
XMLStreamException {
+ this.fsStream = fsStream;
+ XMLInputFactory inputFactory = XMLInputFactory.newInstance();
+ reader = inputFactory.createXMLEventReader(fsStream);
+ fieldNameStack = new Stack<>();
+ rowWriterStack = new Stack<>();
+ nestedMapCollection = new HashMap<>();
+ this.dataLevel = dataLevel;
+ this.maxRecords = maxRecords;
+
+ }
+
+ public void open(RowSetLoader rootRowWriter, CustomErrorContext errorContext
) {
+ this.errorContext = errorContext;
+ this.rootRowWriter = rootRowWriter;
+ attributeWriter = getAttributeWriter();
+ }
+
+ public boolean next() {
+ while (!rootRowWriter.isFull()) {
+ try {
+ if (!processElements()) {
+ return false;
+ }
+ } catch (Exception e) {
+ throw UserException
+ .dataReadError(e)
+ .message("Error parsing file: " + e.getMessage())
+ .addContext(errorContext)
+ .build(logger);
+ }
+ }
+ return true;
+ }
+
+
+ public void close() {
+ if (fsStream != null) {
+ AutoCloseables.closeSilently(fsStream);
+ fsStream = null;
+ }
+
+ if (reader != null) {
+ try {
+ reader.close();
+ } catch (XMLStreamException e) {
+ logger.warn("Error when closing XML stream: {}", e.getMessage());
+ }
+ reader = null;
+ }
+ }
+
+ /**
+ * This function processes the XML elements. This function stops reading
when the
+ * limit (if any) which came from the query has been reached or the Iterator
runs out of
+ * elements.
+ * @return True if there are more elements to parse, false if not
+ */
+ private boolean processElements() {
+ XMLEvent nextEvent;
+
+ if (!reader.hasNext()) {
+ // Stop reading if there are no more results
+ return false;
+ } else if (rootRowWriter.limitReached(maxRecords)) {
+ // Stop if the query limit has been reached
+ return false;
+ }
+
+ // Iterate over XML events
+ while (reader.hasNext()) {
+ // get the current event
+ try {
+ nextEvent = reader.nextEvent();
+
+ // If the next event is whitespace, newlines, or other cruft that we
don't need
+ // ignore and move to the next event
+ if (XMLUtils.isEmptyWhiteSpace(nextEvent)) {
+ continue;
+ }
+
+ // Capture the previous and current event
+ XMLEvent lastEvent = currentEvent;
+ currentEvent = nextEvent;
+
+ // Process the event
+ processEvent(currentEvent, lastEvent);
+ } catch (XMLStreamException e) {
+ throw UserException
+ .dataReadError(e)
+ .message("Error parsing XML file: " + e.getMessage())
+ .addContext(errorContext)
+ .build(logger);
+ }
+ }
+ return true;
+ }
+
+ /**
+ * This function processes an actual XMLEvent. There are three possibilities:
+ * 1. The event is a start event
+ * 2. The event contains text
+ * 3. The event is a closing tag
+ * There are other possible elements, but they are not relevant for our
purposes.
+ *
+ * @param currentEvent The current event to be processed
+ * @param lastEvent The previous event which was processed
+ */
+ private void processEvent(XMLEvent currentEvent,
+ XMLEvent lastEvent) {
+ String mapName;
+ switch (currentEvent.getEventType()) {
+
+ /*
+ * This case handles start elements.
+ * Case 1: The current nesting level is less than the data level.
+ * In this case, increase the nesting level and stop processing.
+ *
+ * Case 2: The nesting level is higher than the data level.
+ * In this case, a few things must happen.
+ * 1. We capture the field name
+ * 2. If the row has not started, we start the row
+ * 3. Set the possible map flag
+ * 4. Process attributes
+ * 5. Push both the field name and writer to the stacks
+ */
+ case XMLStreamConstants.START_ELEMENT:
+ currentNestingLevel++;
+
+ // Case 1: Current nesting level is less than the data level
+ if (currentNestingLevel < dataLevel) {
+ // Stop here if the current level of nesting has not reached the
data.
+ break;
+ }
+
+ StartElement startElement = currentEvent.asStartElement();
+ // Get the field name
+ fieldName = startElement.getName().getLocalPart();
+
+ if (rootDataFieldName == null && currentNestingLevel == dataLevel) {
+ rootDataFieldName = fieldName;
+ logger.debug("Root field name: {}", rootDataFieldName);
+ }
+
+ if (!rowStarted) {
+ currentTupleWriter = startRow(rootRowWriter);
+ } else {
+ if (lastEvent!= null &&
+ lastEvent.getEventType() == XMLStreamConstants.START_ELEMENT) {
+ /*
+ * Check the flag in the next section. If the next element is a
character AND the flag is set,
+ * start a map. If not... ignore it all.
+ */
+ changeState(xmlState.POSSIBLE_MAP);
+
+ rowWriterStack.push(currentTupleWriter);
+ }
+
+ fieldNameStack.push(fieldName);
+ if (currentNestingLevel > dataLevel) {
+ attributePrefix = XMLUtils.addField(attributePrefix, fieldName);
+ }
+
+ Iterator<Attribute> attributes = startElement.getAttributes();
+ writeAttributes(attributePrefix, attributes);
+ }
+ break;
+
+ /*
+ * This case processes character elements.
+ */
+ case XMLStreamConstants.CHARACTERS:
+ if (currentState == xmlState.ROW_ENDED) {
+ break;
+ }
+
+ // Get the field value but ignore characters outside of rows
+ if (rowStarted) {
+ if (currentState == xmlState.POSSIBLE_MAP && currentNestingLevel >
dataLevel +1) {
+ changeState(xmlState.NESTED_MAP_STARTED);
+
+ // Remove the current field name from the stack
+ if (fieldNameStack.size() > 1) {
+ fieldNameStack.pop();
+ }
+ // Get the map name and push to stack
+ mapName = fieldNameStack.pop();
+ currentTupleWriter = getMapWriter(mapName, currentTupleWriter);
+ } else {
+ changeState(xmlState.ROW_STARTED);
+ }
+ }
+
+ fieldValue = currentEvent.asCharacters().getData().trim();
+ changeState(xmlState.GETTING_DATA);
+ break;
+
+ case XMLStreamConstants.END_ELEMENT:
+ currentNestingLevel--;
+ // End the row
+ if (currentNestingLevel < dataLevel - 1) {
+ break;
+ } else if
(currentEvent.asEndElement().getName().toString().compareTo(rootDataFieldName)
== 0) {
+ currentTupleWriter = endRow();
+
+ // Clear stacks
+ rowWriterStack.clear();
+ fieldNameStack.clear();
+ attributePrefix = "";
+
+ } else if (currentState == xmlState.FIELD_ENDED && currentNestingLevel
>= dataLevel) {
+ // Case to end nested maps
+ // Pop tupleWriter off stack
+ currentTupleWriter = rowWriterStack.pop();
+ attributePrefix = XMLUtils.removeField(attributePrefix);
+
+ } else if (currentState != xmlState.ROW_ENDED){
+ writeFieldData(fieldName, fieldValue, currentTupleWriter);
+ // Clear out field name and value
+ fieldName = null;
+ fieldValue = null;
+ attributePrefix = XMLUtils.removeField(attributePrefix);
+ }
+ break;
+ }
+ }
+
+ private TupleWriter startRow(RowSetLoader writer) {
+ if (currentNestingLevel == dataLevel) {
+ rootRowWriter.start();
+ rowStarted = true;
+ rowWriterStack.push(rootRowWriter);
+ changeState(xmlState.ROW_STARTED);
+ return rootRowWriter;
+ } else {
+ rowStarted = false;
+ return writer;
+ }
+ }
+
+ /**
+ * This method executes the steps to end a row from an XML dataset.
+ * @return the root row writer
+ */
+ private TupleWriter endRow() {
+ logger.debug("Ending row");
+ rootRowWriter.save();
+ rowStarted = false;
+ changeState(xmlState.ROW_ENDED);
+ return rootRowWriter;
+ }
+
+ /**
+ * Writes a field. If the field does not have a corresponding ScalarWriter,
this method will
+ * create one.
+ * @param fieldName The field name
+ * @param fieldValue The field value to be written
+ * @param writer The TupleWriter which represents
+ */
+ private void writeFieldData(String fieldName, String fieldValue, TupleWriter
writer) {
+ if (fieldName == null) {
+ return;
+ }
+
+ changeState(xmlState.WRITING_DATA);
+
+ // Find the TupleWriter object
+ int index = writer.tupleSchema().index(fieldName);
+ if (index == -1) {
+ ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName,
TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
+ index = writer.addColumn(colSchema);
+ }
+ ScalarWriter colWriter = writer.scalar(index);
+ if (fieldValue != null && (currentState != xmlState.ROW_ENDED &&
currentState != xmlState.FIELD_ENDED)) {
+ colWriter.setString(fieldValue);
+ changeState(xmlState.FIELD_ENDED);
+ }
+ }
+
+ /**
+ * Returns a MapWriter for a given field. If the writer does not exist, add
one to the schema
+ * @param mapName The Map's name
+ * @param rowWriter The current TupleWriter
+ * @return A TupleWriter of the new map
+ */
+ private TupleWriter getMapWriter(String mapName, TupleWriter rowWriter) {
+ logger.debug("Adding map: {}", mapName);
+ int index = rowWriter.tupleSchema().index(mapName);
+ if (index == -1) {
+ // Check to see if the map already exists in the map collection
+ // This condition can occur in deeply nested data.
+ String tempFieldName = mapName + "-" + currentNestingLevel;
+ XMLMap mapObject = nestedMapCollection.get(tempFieldName);
+ if (mapObject != null) {
+ logger.debug("Found map {}", tempFieldName);
+ return mapObject.getMapWriter();
+ }
+
+ index = rowWriter.addColumn(SchemaBuilder.columnSchema(mapName,
MinorType.MAP, DataMode.REQUIRED));
+ // Add map to map collection for future use
+ nestedMapCollection.put(tempFieldName, new XMLMap(mapName,
rowWriter.tuple(index)));
+ }
+ return rowWriter.tuple(index);
+ }
+
+ private void changeState(xmlState newState) {
+ xmlState previousState = currentState;
+ currentState = newState;
+ }
+
+ private TupleWriter getAttributeWriter() {
+ int attributeIndex =
rootRowWriter.addColumn(SchemaBuilder.columnSchema(ATTRIBUTE_MAP_NAME,
MinorType.MAP, DataMode.REQUIRED));
+ return rootRowWriter.tuple(attributeIndex);
+ }
+
+ /**
+ * Helper function which writes attributes of an XML element.
+ * @param prefix The attribute prefix
+ * @param attributes An iterator of Attribute objects
+ */
+ private void writeAttributes(String prefix, Iterator<Attribute> attributes) {
+ while (attributes.hasNext()) {
+ Attribute currentAttribute = attributes.next();
+ String key = prefix + "_" + currentAttribute.getName().toString();
+ writeFieldData(key, currentAttribute.getValue(), attributeWriter);
+ }
+ }
+
+}
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
new file mode 100644
index 0000000..b7ea8e7
--- /dev/null
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLUtils.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml;
+
+import org.apache.drill.shaded.guava.com.google.common.base.Strings;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.events.XMLEvent;
+
+public class XMLUtils {
+
+ /**
+ * Empty events are not helpful so this method checks to see if the event
consists solely of whitespace
+ * or newline characters. Unfortunately, newlines and other extraneous
characters are treated as new elements, so
+ * this function wraps a lot of those checks in one function.
+ * @param event The input XMLEvent
+ * @return True if the XMLEvent is only whitespace, false if not.
+ */
+ public static boolean isEmptyWhiteSpace(XMLEvent event) {
+ if (event.getEventType() == XMLStreamConstants.COMMENT) {
+ return true;
+ } else if (event.getEventType() != XMLStreamConstants.CHARACTERS) {
+ return false;
+ }
+
+ String value = event.asCharacters().getData();
+ if (Strings.isNullOrEmpty(value.trim())) {
+ return true;
+ } else {
+ return event.asCharacters().isIgnorableWhiteSpace();
+ }
+ }
+
+ /**
+ * Identifies XML events that may be populated but are not useful for
extracting data.
+ * @param event The XMLEvent in question
+ * @return True if the event is useful, false if not
+ */
+ public static boolean isNotCruft(XMLEvent event) {
+ int eventType = event.getEventType();
+ return eventType == XMLStreamConstants.CHARACTERS ||
+ eventType == XMLStreamConstants.START_ELEMENT ||
+ eventType == XMLStreamConstants.END_ELEMENT;
+ }
+
+ /**
+ * Generates a nested field name by combining a field prefix to the current
field name.
+ * @param prefix The prefix to be added to the field name.
+ * @param field The field name
+ * @return the prefix, followed by an underscore and the fieldname.
+ */
+ public static String addField(String prefix, String field) {
+ if (Strings.isNullOrEmpty(prefix)) {
+ return field;
+ }
+ return prefix + "_" + field;
+ }
+
+ /**
+ * Returns the field name from nested field names
+ * @param fieldName The nested field name
+ * @return The field name
+ */
+ public static String removeField(String fieldName) {
+ String[] components = fieldName.split("_");
+ StringBuilder newField = new StringBuilder();
+ for (int i = 0; i < components.length - 1; i++) {
+ if (i > 0) {
+ newField.append("_").append(components[i]);
+ } else {
+ newField = new StringBuilder(components[i]);
+ }
+ }
+ return newField.toString();
+ }
+
+}
diff --git
a/contrib/format-xml/src/main/resources/bootstrap-format-plugins.json
b/contrib/format-xml/src/main/resources/bootstrap-format-plugins.json
new file mode 100644
index 0000000..ef5f59c
--- /dev/null
+++ b/contrib/format-xml/src/main/resources/bootstrap-format-plugins.json
@@ -0,0 +1,26 @@
+{
+ "storage":{
+ "dfs": {
+ "type": "file",
+ "formats": {
+ "xml": {
+ "type": "xml",
+ "extensions": [
+ "xml"
+ ]
+ }
+ }
+ },
+ "s3": {
+ "type": "file",
+ "formats": {
+ "xml": {
+ "type": "xml",
+ "extensions": [
+ "xml"
+ ]
+ }
+ }
+ }
+ }
+}
diff --git a/contrib/format-xml/src/main/resources/drill-module.conf
b/contrib/format-xml/src/main/resources/drill-module.conf
new file mode 100644
index 0000000..04406a3
--- /dev/null
+++ b/contrib/format-xml/src/main/resources/drill-module.conf
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file tells Drill to consider this module when class path scanning.
+# This file can also include any supplementary configuration information.
+# This file is in HOCON format, see
https://github.com/typesafehub/config/blob/master/HOCON.md for more information.
+
+drill.classpath.scanning: {
+ packages += "org.apache.drill.exec.store.xml"
+}
diff --git
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
new file mode 100644
index 0000000..b515dab
--- /dev/null
+++
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
@@ -0,0 +1,428 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.xml;
+
+import org.apache.drill.categories.RowSetTests;
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.physical.rowSet.RowSet;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.rowSet.RowSetComparison;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.nio.file.Paths;
+
+import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
+import static org.apache.drill.test.rowSet.RowSetUtilities.mapArray;
+import static org.apache.drill.test.rowSet.RowSetUtilities.objArray;
+import static org.apache.drill.test.rowSet.RowSetUtilities.strArray;
+import static org.junit.Assert.assertEquals;
+
+@Category(RowSetTests.class)
+public class TestXMLReader extends ClusterTest {
+
+ @BeforeClass
+ public static void setup() throws Exception {
+ ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));
+
+ XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2);
+ cluster.defineFormat("cp", "xml", formatConfig);
+ cluster.defineFormat("dfs", "xml", formatConfig);
+
+ // Needed for compressed file unit test
+ dirTestWatcher.copyResourceToRoot(Paths.get("xml/"));
+ }
+
+ /**
+ * This unit test tests a simple XML file with no nesting or attributes
+ * @throws Exception Throw exception if anything goes wrong
+ */
+ @Test
+ public void testWildcard() throws Exception {
+ String sql = "SELECT * FROM cp.`xml/simple.xml`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+ assertEquals(3, results.rowCount());
+
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .add("attributes", MinorType.MAP)
+ .addNullable("groupID", MinorType.VARCHAR)
+ .addNullable("artifactID", MinorType.VARCHAR)
+ .addNullable("version", MinorType.VARCHAR)
+ .addNullable("classifier", MinorType.VARCHAR)
+ .addNullable("scope", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(mapArray(), "org.apache.drill.exec", "drill-java-exec",
"${project.version}", null, null)
+ .addRow(mapArray(),"org.apache.drill.exec", "drill-java-exec",
"${project.version}", "tests", "test")
+ .addRow(mapArray(),"org.apache.drill", "drill-common",
"${project.version}", "tests", "test")
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ /**
+ * This unit test tests a simple XML file with no nesting or attributes, but
with explicitly selected fields.
+ * @throws Exception Throw exception if anything goes wrong
+ */
+ @Test
+ public void testExplicitWithSimpleXMLFile() throws Exception {
+ String sql = "SELECT groupID, artifactID, version, classifier, scope FROM
cp.`xml/simple.xml`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ assertEquals(3, results.rowCount());
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("groupID", MinorType.VARCHAR)
+ .addNullable("artifactID", MinorType.VARCHAR)
+ .addNullable("version", MinorType.VARCHAR)
+ .addNullable("classifier", MinorType.VARCHAR)
+ .addNullable("scope", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow("org.apache.drill.exec", "drill-java-exec",
"${project.version}", null, null)
+ .addRow("org.apache.drill.exec", "drill-java-exec",
"${project.version}", "tests", "test")
+ .addRow("org.apache.drill", "drill-common", "${project.version}",
"tests", "test")
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testWildcardWithFilter() throws Exception {
+ String sql = "SELECT * FROM cp.`xml/simple.xml` WHERE scope='test'";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+ assertEquals(2, results.rowCount());
+
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .add("attributes", MinorType.MAP)
+ .addNullable("groupID", MinorType.VARCHAR)
+ .addNullable("artifactID", MinorType.VARCHAR)
+ .addNullable("version", MinorType.VARCHAR)
+ .addNullable("classifier", MinorType.VARCHAR)
+ .addNullable("scope", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(mapArray(),"org.apache.drill.exec", "drill-java-exec",
"${project.version}", "tests", "test")
+ .addRow(mapArray(),"org.apache.drill", "drill-common",
"${project.version}", "tests", "test")
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testWildcardWithSingleNestedDataField() throws Exception {
+ String sql = "SELECT * FROM cp.`xml/really-simple-nested.xml`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+ assertEquals(3, results.rowCount());
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .add("attributes", MinorType.MAP, DataMode.REQUIRED)
+ .addMap("field1")
+ .addNullable("key1", MinorType.VARCHAR)
+ .addNullable("key2", MinorType.VARCHAR)
+ .resumeSchema()
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(mapArray(), strArray("value1", "value2"))
+ .addRow(mapArray(), strArray("value3", "value4"))
+ .addRow(mapArray(), strArray("value5", "value6"))
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testExplicitWithSingleNestedDataField() throws Exception {
+ String sql = "SELECT t1.field1.key1 as key1, t1.field1.key2 as key2 FROM
cp.`xml/really-simple-nested.xml` as t1";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+ assertEquals(3, results.rowCount());
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("key1", MinorType.VARCHAR)
+ .addNullable("key2", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow("value1", "value2")
+ .addRow("value3", "value4")
+ .addRow("value5", "value6")
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testSerDe() throws Exception {
+ String sql = "SELECT COUNT(*) FROM cp.`xml/simple.xml`";
+ String plan = queryBuilder().sql(sql).explainJson();
+ long cnt = queryBuilder().physical(plan).singletonLong();
+ assertEquals("Counts should match", 3L, cnt);
+ }
+
+ @Test
+ public void testExplicitWithCompressedSimpleXMLFile() throws Exception {
+ generateCompressedFile("xml/simple.xml", "zip", "xml/simple.xml.zip");
+
+ String sql = "SELECT groupID, artifactID, version, classifier, scope FROM
dfs.`xml/simple.xml.zip`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ assertEquals(3, results.rowCount());
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("groupID", MinorType.VARCHAR)
+ .addNullable("artifactID", MinorType.VARCHAR)
+ .addNullable("version", MinorType.VARCHAR)
+ .addNullable("classifier", MinorType.VARCHAR)
+ .addNullable("scope", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow("org.apache.drill.exec", "drill-java-exec",
"${project.version}", null, null)
+ .addRow("org.apache.drill.exec", "drill-java-exec",
"${project.version}", "tests", "test")
+ .addRow("org.apache.drill", "drill-common", "${project.version}",
"tests", "test")
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testDeepNestedSpecificFields() throws Exception {
+ String sql = "select xml.level2.level3.level4.level5.level6.level7.field1
as field1, xml.level2.level3.level4.level5.level6.level7.field2 as field2,
xml.level2.level3.level4" +
+ ".level5.level6.level7.field3 as field3 FROM cp.`xml/deep-nested.xml` as
xml";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ assertEquals(2, results.rowCount());
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("field1", MinorType.VARCHAR)
+ .addNullable("field2", MinorType.VARCHAR)
+ .addNullable("field3", MinorType.VARCHAR)
+ .build();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow("f1", "f2", "f3")
+ .addRow("f4", "f5", "f6")
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testDeepNesting() throws Exception {
+ String sql = "SELECT * FROM cp.`xml/deep-nested.xml`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ assertEquals(2, results.rowCount());
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .add("attributes", MinorType.MAP, DataMode.REQUIRED)
+ .addMap("level2")
+ .addNullable("field1-level2", MinorType.VARCHAR)
+ .addMap("level3")
+ .addNullable("field1-level3", MinorType.VARCHAR)
+ .addMap("level4")
+ .addNullable("field1-level4", MinorType.VARCHAR)
+ .addMap("level5")
+ .addNullable("field1-level5", MinorType.VARCHAR)
+ .addMap("level6")
+ .addNullable("field1-level6", MinorType.VARCHAR)
+ .addMap("level7")
+ .addNullable("field1", MinorType.VARCHAR)
+ .addNullable("field2", MinorType.VARCHAR)
+ .addNullable("field3", MinorType.VARCHAR)
+ .resumeMap() // End level 7
+ .resumeMap() // End level 6
+ .resumeMap() // End level 5
+ .resumeMap() // End level 4
+ .resumeMap() // End level 3
+ .resumeSchema()
+ .build();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(mapArray(), objArray(
+ objArray(
+ "l2",
+ objArray("l3",
+ objArray("l4",
+ objArray("l5",
+ objArray("l6",
+ strArray("f1", "f2", "f3")
+ )
+ )
+ )
+ )
+ )
+ ))
+ .addRow(mapArray(), objArray(
+ objArray(
+ null,
+ objArray(null,
+ objArray(null,
+ objArray(null,
+ objArray(null,
+ strArray("f4", "f5", "f6")
+ )
+ )
+ )
+ )
+ )
+ ))
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testDataLevel() throws Exception {
+ String sql = "SELECT * FROM table(cp.`xml/deep-nested2.xml` (type =>
'xml', dataLevel => 8))";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .add("attributes", MinorType.MAP, DataMode.REQUIRED)
+ .addNullable("field1", MinorType.VARCHAR)
+ .addNullable("field2", MinorType.VARCHAR)
+ .addNullable("field3", MinorType.VARCHAR)
+ .addNullable("field1-level6", MinorType.VARCHAR)
+ .build();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(mapArray(), "f4", "f5", "f6", null)
+ .addRow(mapArray(), "f1", "f2", "f3", "l6")
+ .build();
+
+ assertEquals(2, results.rowCount());
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testExplicitDataLevel() throws Exception {
+ String sql = "SELECT field1, field2, field3 FROM
table(cp.`xml/deep-nested2.xml` (type => 'xml', dataLevel => 8))";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("field1", MinorType.VARCHAR)
+ .addNullable("field2", MinorType.VARCHAR)
+ .addNullable("field3", MinorType.VARCHAR)
+ .build();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow("f4", "f5", "f6")
+ .addRow("f1", "f2", "f3")
+ .build();
+
+ assertEquals(2, results.rowCount());
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testComplexWildcardStar() throws Exception {
+ String sql = "SELECT * FROM cp.`xml/nested.xml`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .add("attributes", MinorType.MAP, DataMode.REQUIRED)
+ .addMap("field1")
+ .addNullable("key1", MinorType.VARCHAR)
+ .addNullable("key2", MinorType.VARCHAR)
+ .resumeSchema()
+ .addMap("field2")
+ .addNullable("key3", MinorType.VARCHAR)
+ .addMap("nestedField1")
+ .addNullable("nk1", MinorType.VARCHAR)
+ .addNullable("nk2", MinorType.VARCHAR)
+ .addNullable("nk3", MinorType.VARCHAR)
+ .resumeMap()
+ .resumeSchema()
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(mapArray(), strArray("value1", "value2"), objArray("k1",
strArray("nk_value1", "nk_value2", "nk_value3")))
+ .addRow(mapArray(), strArray("value3", "value4"), objArray("k2",
strArray("nk_value4", "nk_value5", "nk_value6")))
+ .addRow(mapArray(), strArray("value5", "value6"), objArray("k3",
strArray("nk_value7", "nk_value8", "nk_value9")))
+ .build();
+
+ assertEquals(3, results.rowCount());
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testComplexNestedExplicit() throws Exception {
+ String sql = "SELECT xml.field2.nestedField1.nk1 as nk1,
xml.field2.nestedField1.nk2 as nk2, xml.field2.nestedField1.nk3 as nk3 FROM
cp.`xml/nested.xml` AS xml";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("nk1", MinorType.VARCHAR)
+ .addNullable("nk2", MinorType.VARCHAR)
+ .addNullable("nk3", MinorType.VARCHAR)
+ .build();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow("nk_value1", "nk_value2", "nk_value3")
+ .addRow("nk_value4", "nk_value5", "nk_value6")
+ .addRow("nk_value7", "nk_value8", "nk_value9")
+ .build();
+
+ assertEquals(3, results.rowCount());
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testAttributes() throws Exception {
+ String sql = "SELECT attributes FROM cp.`xml/attributes.xml`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addMap("attributes")
+ .addNullable("title_binding", MinorType.VARCHAR)
+ .addNullable("title_subcategory", MinorType.VARCHAR)
+ .resumeSchema()
+ .build();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow((Object) mapArray(null, null))
+ .addRow((Object) strArray("paperback", null))
+ .addRow((Object) strArray("hardcover", "non-fiction"))
+ .build();
+
+ assertEquals(3, results.rowCount());
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testLimitPushdown() throws Exception {
+ String sql = "SELECT * FROM cp.`xml/simple.xml` LIMIT 2";
+
+ queryBuilder()
+ .sql(sql)
+ .planMatcher()
+ .include("Limit", "maxRecords=2")
+ .match();
+ }
+}
diff --git a/contrib/format-xml/src/test/resources/xml/attributes.xml
b/contrib/format-xml/src/test/resources/xml/attributes.xml
new file mode 100644
index 0000000..a44eca0
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/attributes.xml
@@ -0,0 +1,42 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<books>
+ <book>
+ <author>Mark Twain</author>
+ <title>The Adventures of Tom Sawyer</title>
+ <category>FICTION</category>
+ <year>1876</year>
+ </book>
+ <book>
+ <authors>
+ <author>Niklaus Wirth</author>
+ <author>Somebody else</author>
+ </authors>
+ <title binding="paperback">The Programming Language Pascal</title>
+ <category >PASCAL</category>
+ <year>1971</year>
+ </book>
+ <book>
+ <author>O.-J. Dahl</author>
+ <title binding="hardcover" subcategory="non-fiction">Structured
Programming</title>
+ <category>PROGRAMMING</category>
+ <year>1972</year>
+ </book>
+</books>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/deep-nested.xml
b/contrib/format-xml/src/test/resources/xml/deep-nested.xml
new file mode 100644
index 0000000..2d28289
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/deep-nested.xml
@@ -0,0 +1,60 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<root>
+ <level1>
+ <level2>
+ <field1-level2>l2</field1-level2>
+ <level3>
+ <field1-level3>l3</field1-level3>
+ <level4>
+ <field1-level4>l4</field1-level4>
+ <level5>
+ <field1-level5>l5</field1-level5>
+ <level6>
+ <field1-level6>l6</field1-level6>
+ <level7>
+ <field1>f1</field1>
+ <field2>f2</field2>
+ <field3>f3</field3>
+ </level7>
+ </level6>
+ </level5>
+ </level4>
+ </level3>
+ </level2>
+ </level1>
+ <level1>
+ <level2>
+ <level3>
+ <level4>
+ <level5>
+ <level6>
+ <level7>
+ <field1>f4</field1>
+ <field2>f5</field2>
+ <field3>f6</field3>
+ </level7>
+ </level6>
+ </level5>
+ </level4>
+ </level3>
+ </level2>
+ </level1>
+</root>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/deep-nested2.xml
b/contrib/format-xml/src/test/resources/xml/deep-nested2.xml
new file mode 100644
index 0000000..0a1b787
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/deep-nested2.xml
@@ -0,0 +1,60 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<root>
+ <level1>
+ <level2>
+ <level3>
+ <level4>
+ <level5>
+ <level6>
+ <level7>
+ <field1>f4</field1>
+ <field2>f5</field2>
+ <field3>f6</field3>
+ </level7>
+ </level6>
+ </level5>
+ </level4>
+ </level3>
+ </level2>
+ </level1>
+ <level1>
+ <level2>
+ <field1-level2>l2</field1-level2>
+ <level3>
+ <field1-level3>l3</field1-level3>
+ <level4>
+ <field1-level4>l4</field1-level4>
+ <level5>
+ <field1-level5>l5</field1-level5>
+ <level6>
+ <field1-level6>l6</field1-level6>
+ <level7>
+ <field1>f1</field1>
+ <field2>f2</field2>
+ <field3>f3</field3>
+ </level7>
+ </level6>
+ </level5>
+ </level4>
+ </level3>
+ </level2>
+ </level1>
+</root>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/nested.xml
b/contrib/format-xml/src/test/resources/xml/nested.xml
new file mode 100644
index 0000000..da94687
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/nested.xml
@@ -0,0 +1,63 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<books>
+ <book>
+ <field1>
+ <key1>value1</key1>
+ <key2>value2</key2>
+ </field1>
+ <field2>
+ <key3>k1</key3>
+ <nestedField1>
+ <nk1>nk_value1</nk1>
+ <nk2>nk_value2</nk2>
+ <nk3>nk_value3</nk3>
+ </nestedField1>
+ </field2>
+ </book>
+ <book>
+ <field1>
+ <key1>value3</key1>
+ <key2>value4</key2>
+ </field1>
+ <field2>
+ <key3>k2</key3>
+ <nestedField1>
+ <nk1>nk_value4</nk1>
+ <nk2>nk_value5</nk2>
+ <nk3>nk_value6</nk3>
+ </nestedField1>
+ </field2>
+ </book>
+ <book>
+ <field1>
+ <key1>value5</key1>
+ <key2>value6</key2>
+ </field1>
+ <field2>
+ <key3>k3</key3>
+ <nestedField1>
+ <nk1>nk_value7</nk1>
+ <nk2>nk_value8</nk2>
+ <nk3>nk_value9</nk3>
+ </nestedField1>
+ </field2>
+ </book>
+</books>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/really-simple-nested.xml
b/contrib/format-xml/src/test/resources/xml/really-simple-nested.xml
new file mode 100644
index 0000000..5bb1d18
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/really-simple-nested.xml
@@ -0,0 +1,39 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<books>
+ <book>
+ <field1>
+ <key1>value1</key1>
+ <key2>value2</key2>
+ </field1>
+ </book>
+ <book>
+ <field1>
+ <key1>value3</key1>
+ <key2>value4</key2>
+ </field1>
+ </book>
+ <book>
+ <field1>
+ <key1>value5</key1>
+ <key2>value6</key2>
+ </field1>
+ </book>
+</books>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/simple.xml
b/contrib/format-xml/src/test/resources/xml/simple.xml
new file mode 100644
index 0000000..f651ed6
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/simple.xml
@@ -0,0 +1,42 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<dependencies>
+ <dependency>
+ <groupId>org.apache.drill.exec</groupId>
+ <artifactId>drill-java-exec</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.drill.exec</groupId>
+ <artifactId>drill-java-exec</artifactId>
+ <classifier>tests</classifier>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.drill</groupId>
+ <artifactId>drill-common</artifactId>
+ <classifier>tests</classifier>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+</dependencies>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/simple_schema.xsd
b/contrib/format-xml/src/test/resources/xml/simple_schema.xsd
new file mode 100644
index 0000000..df825b3
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/simple_schema.xsd
@@ -0,0 +1,43 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+ xmlns:tns="http://tempuri.org/PurchaseOrderSchema.xsd"
+ targetNamespace="http://tempuri.org/PurchaseOrderSchema.xsd"
+ elementFormDefault="qualified">
+ <xsd:element name="PurchaseOrder" type="tns:PurchaseOrderType"/>
+ <xsd:complexType name="PurchaseOrderType">
+ <xsd:sequence>
+ <xsd:element name="ShipTo" type="tns:USAddress" maxOccurs="2"/>
+ <xsd:element name="BillTo" type="tns:USAddress"/>
+ </xsd:sequence>
+ <xsd:attribute name="OrderDate" type="xsd:date"/>
+ </xsd:complexType>
+
+ <xsd:complexType name="USAddress">
+ <xsd:sequence>
+ <xsd:element name="name" type="xsd:string"/>
+ <xsd:element name="street" type="xsd:string"/>
+ <xsd:element name="city" type="xsd:string"/>
+ <xsd:element name="state" type="xsd:string"/>
+ <xsd:element name="zip" type="xsd:integer"/>
+ </xsd:sequence>
+ <xsd:attribute name="country" type="xsd:NMTOKEN" fixed="US"/>
+ </xsd:complexType>
+</xsd:schema>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/very-nested.xml
b/contrib/format-xml/src/test/resources/xml/very-nested.xml
new file mode 100644
index 0000000..bdf8a0c
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/very-nested.xml
@@ -0,0 +1,38 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<book>
+ <field1>
+ <key1>value1</key1>
+ <key2>value2</key2>
+ </field1>
+ <field2>
+ <key3>k1</key3>
+ <nestedField1>
+ <nk1>nk_value1</nk1>
+ <nk2>nk_value2</nk2>
+ <nk3>nk_value3</nk3>
+ <nestedField2>
+ <nk1>nk2_value1</nk1>
+ <nk2>nk2_value2</nk2>
+ <nk3>nk2_value3</nk3>
+ </nestedField2>
+ </nestedField1>
+ </field2>
+</book>
\ No newline at end of file
diff --git a/contrib/native/client/src/protobuf/UserBitShared.pb.cc
b/contrib/native/client/src/protobuf/UserBitShared.pb.cc
index 6dbd625..26b2e4f 100644
--- a/contrib/native/client/src/protobuf/UserBitShared.pb.cc
+++ b/contrib/native/client/src/protobuf/UserBitShared.pb.cc
@@ -956,7 +956,7 @@ const char
descriptor_table_protodef_UserBitShared_2eproto[] PROTOBUF_SECTION_VA
"ATEMENT\020\005*\207\001\n\rFragmentState\022\013\n\007SENDING\020\000"
"\022\027\n\023AWAITING_ALLOCATION\020\001\022\013\n\007RUNNING\020\002\022\014"
"\n\010FINISHED\020\003\022\r\n\tCANCELLED\020\004\022\n\n\006FAILED\020\005\022"
- "\032\n\026CANCELLATION_REQUESTED\020\006*\236\013\n\020CoreOper"
+ "\032\n\026CANCELLATION_REQUESTED\020\006*\260\013\n\020CoreOper"
"atorType\022\021\n\rSINGLE_SENDER\020\000\022\024\n\020BROADCAST"
"_SENDER\020\001\022\n\n\006FILTER\020\002\022\022\n\016HASH_AGGREGATE\020"
"\003\022\r\n\tHASH_JOIN\020\004\022\016\n\nMERGE_JOIN\020\005\022\031\n\025HASH"
@@ -992,11 +992,11 @@ const char
descriptor_table_protodef_UserBitShared_2eproto[] PROTOBUF_SECTION_VA
"CAN\020\?\022\022\n\016EXCEL_SUB_SCAN\020@\022\020\n\014SHP_SUB_SCA"
"N\020A\022\024\n\020METADATA_HANDLER\020B\022\027\n\023METADATA_CO"
"NTROLLER\020C\022\022\n\016DRUID_SUB_SCAN\020D\022\021\n\rSPSS_S"
- "UB_SCAN\020E\022\021\n\rHTTP_SUB_SCAN\020F*g\n\nSaslStat"
-
"us\022\020\n\014SASL_UNKNOWN\020\000\022\016\n\nSASL_START\020\001\022\024\n\020"
- "SASL_IN_PROGRESS\020\002\022\020\n\014SASL_SUCCESS\020\003\022\017\n\013"
- "SASL_FAILED\020\004B.\n\033org.apache.drill.exec.p"
- "rotoB\rUserBitSharedH\001"
+ "UB_SCAN\020E\022\021\n\rHTTP_SUB_SCAN\020F\022\020\n\014XML_SUB_"
+ "SCAN\020G*g\n\nSaslStatus\022\020\n\014SASL_UNKNOWN\020\000\022\016"
+ "\n\nSASL_START\020\001\022\024\n\020SASL_IN_PROGRESS\020\002\022\020\n\014"
+ "SASL_SUCCESS\020\003\022\017\n\013SASL_FAILED\020\004B.\n\033org.a"
+ "pache.drill.exec.protoB\rUserBitSharedH\001"
;
static const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable*const
descriptor_table_UserBitShared_2eproto_deps[3] = {
&::descriptor_table_Coordination_2eproto,
@@ -1030,7 +1030,7 @@ static
::PROTOBUF_NAMESPACE_ID::internal::SCCInfoBase*const descriptor_table_Use
static ::PROTOBUF_NAMESPACE_ID::internal::once_flag
descriptor_table_UserBitShared_2eproto_once;
static bool descriptor_table_UserBitShared_2eproto_initialized = false;
const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable
descriptor_table_UserBitShared_2eproto = {
- &descriptor_table_UserBitShared_2eproto_initialized,
descriptor_table_protodef_UserBitShared_2eproto, "UserBitShared.proto", 5821,
+ &descriptor_table_UserBitShared_2eproto_initialized,
descriptor_table_protodef_UserBitShared_2eproto, "UserBitShared.proto", 5839,
&descriptor_table_UserBitShared_2eproto_once,
descriptor_table_UserBitShared_2eproto_sccs,
descriptor_table_UserBitShared_2eproto_deps, 22, 3,
schemas, file_default_instances, TableStruct_UserBitShared_2eproto::offsets,
file_level_metadata_UserBitShared_2eproto, 22,
file_level_enum_descriptors_UserBitShared_2eproto,
file_level_service_descriptors_UserBitShared_2eproto,
@@ -1269,6 +1269,7 @@ bool CoreOperatorType_IsValid(int value) {
case 68:
case 69:
case 70:
+ case 71:
return true;
default:
return false;
diff --git a/contrib/native/client/src/protobuf/UserBitShared.pb.h
b/contrib/native/client/src/protobuf/UserBitShared.pb.h
index ae87641..c9afda2 100644
--- a/contrib/native/client/src/protobuf/UserBitShared.pb.h
+++ b/contrib/native/client/src/protobuf/UserBitShared.pb.h
@@ -392,11 +392,12 @@ enum CoreOperatorType : int {
METADATA_CONTROLLER = 67,
DRUID_SUB_SCAN = 68,
SPSS_SUB_SCAN = 69,
- HTTP_SUB_SCAN = 70
+ HTTP_SUB_SCAN = 70,
+ XML_SUB_SCAN = 71
};
bool CoreOperatorType_IsValid(int value);
constexpr CoreOperatorType CoreOperatorType_MIN = SINGLE_SENDER;
-constexpr CoreOperatorType CoreOperatorType_MAX = HTTP_SUB_SCAN;
+constexpr CoreOperatorType CoreOperatorType_MAX = XML_SUB_SCAN;
constexpr int CoreOperatorType_ARRAYSIZE = CoreOperatorType_MAX + 1;
const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* CoreOperatorType_descriptor();
diff --git a/contrib/pom.xml b/contrib/pom.xml
index f5f60ee..d9ea07a 100644
--- a/contrib/pom.xml
+++ b/contrib/pom.xml
@@ -50,6 +50,7 @@
<module>format-esri</module>
<module>format-hdf5</module>
<module>format-spss</module>
+ <module>format-xml</module>
<module>storage-hive</module>
<module>storage-mongo</module>
<module>storage-jdbc</module>
diff --git a/distribution/pom.xml b/distribution/pom.xml
index c6ebecb..8490919 100644
--- a/distribution/pom.xml
+++ b/distribution/pom.xml
@@ -364,6 +364,11 @@
</dependency>
<dependency>
<groupId>org.apache.drill.contrib</groupId>
+ <artifactId>drill-format-xml</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.drill.contrib</groupId>
<artifactId>drill-format-esri</artifactId>
<version>${project.version}</version>
</dependency>
diff --git a/distribution/src/assemble/component.xml
b/distribution/src/assemble/component.xml
index 2148fb8..a7670e3 100644
--- a/distribution/src/assemble/component.xml
+++ b/distribution/src/assemble/component.xml
@@ -44,6 +44,7 @@
<include>org.apache.drill.contrib:drill-format-mapr:jar</include>
<include>org.apache.drill.contrib:drill-format-syslog:jar</include>
<include>org.apache.drill.contrib:drill-format-esri:jar</include>
+ <include>org.apache.drill.contrib:drill-format-xml:jar</include>
<include>org.apache.drill.contrib:drill-format-hdf5:jar</include>
<include>org.apache.drill.contrib:drill-format-ltsv:jar</include>
<include>org.apache.drill.contrib:drill-format-httpd:jar</include>
diff --git
a/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
b/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
index 01a51f0..4292ec1 100644
--- a/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
+++ b/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
@@ -697,6 +697,10 @@ public final class UserBitShared {
* <code>HTTP_SUB_SCAN = 70;</code>
*/
HTTP_SUB_SCAN(70),
+ /**
+ * <code>XML_SUB_SCAN = 71;</code>
+ */
+ XML_SUB_SCAN(71),
;
/**
@@ -983,6 +987,10 @@ public final class UserBitShared {
* <code>HTTP_SUB_SCAN = 70;</code>
*/
public static final int HTTP_SUB_SCAN_VALUE = 70;
+ /**
+ * <code>XML_SUB_SCAN = 71;</code>
+ */
+ public static final int XML_SUB_SCAN_VALUE = 71;
public final int getNumber() {
@@ -1076,6 +1084,7 @@ public final class UserBitShared {
case 68: return DRUID_SUB_SCAN;
case 69: return SPSS_SUB_SCAN;
case 70: return HTTP_SUB_SCAN;
+ case 71: return XML_SUB_SCAN;
default: return null;
}
}
@@ -29055,7 +29064,7 @@ public final class UserBitShared {
"ATEMENT\020\005*\207\001\n\rFragmentState\022\013\n\007SENDING\020\000"
+
"\022\027\n\023AWAITING_ALLOCATION\020\001\022\013\n\007RUNNING\020\002\022\014"
+
"\n\010FINISHED\020\003\022\r\n\tCANCELLED\020\004\022\n\n\006FAILED\020\005\022"
+
- "\032\n\026CANCELLATION_REQUESTED\020\006*\236\013\n\020CoreOper" +
+ "\032\n\026CANCELLATION_REQUESTED\020\006*\260\013\n\020CoreOper" +
"atorType\022\021\n\rSINGLE_SENDER\020\000\022\024\n\020BROADCAST" +
"_SENDER\020\001\022\n\n\006FILTER\020\002\022\022\n\016HASH_AGGREGATE\020" +
"\003\022\r\n\tHASH_JOIN\020\004\022\016\n\nMERGE_JOIN\020\005\022\031\n\025HASH"
+
@@ -29091,11 +29100,11 @@ public final class UserBitShared {
"CAN\020?\022\022\n\016EXCEL_SUB_SCAN\020@\022\020\n\014SHP_SUB_SCA" +
"N\020A\022\024\n\020METADATA_HANDLER\020B\022\027\n\023METADATA_CO" +
"NTROLLER\020C\022\022\n\016DRUID_SUB_SCAN\020D\022\021\n\rSPSS_S" +
- "UB_SCAN\020E\022\021\n\rHTTP_SUB_SCAN\020F*g\n\nSaslStat" +
-
"us\022\020\n\014SASL_UNKNOWN\020\000\022\016\n\nSASL_START\020\001\022\024\n\020"
+
-
"SASL_IN_PROGRESS\020\002\022\020\n\014SASL_SUCCESS\020\003\022\017\n\013" +
- "SASL_FAILED\020\004B.\n\033org.apache.drill.exec.p" +
- "rotoB\rUserBitSharedH\001"
+ "UB_SCAN\020E\022\021\n\rHTTP_SUB_SCAN\020F\022\020\n\014XML_SUB_" +
+ "SCAN\020G*g\n\nSaslStatus\022\020\n\014SASL_UNKNOWN\020\000\022\016" +
+
"\n\nSASL_START\020\001\022\024\n\020SASL_IN_PROGRESS\020\002\022\020\n\014" +
+ "SASL_SUCCESS\020\003\022\017\n\013SASL_FAILED\020\004B.\n\033org.a" +
+ "pache.drill.exec.protoB\rUserBitSharedH\001"
};
descriptor = com.google.protobuf.Descriptors.FileDescriptor
.internalBuildGeneratedFileFrom(descriptorData,
diff --git a/protocol/src/main/protobuf/UserBitShared.proto
b/protocol/src/main/protobuf/UserBitShared.proto
index f7b7b02..c2304cf 100644
--- a/protocol/src/main/protobuf/UserBitShared.proto
+++ b/protocol/src/main/protobuf/UserBitShared.proto
@@ -382,6 +382,7 @@ enum CoreOperatorType {
DRUID_SUB_SCAN = 68;
SPSS_SUB_SCAN = 69;
HTTP_SUB_SCAN = 70;
+ XML_SUB_SCAN = 71;
}
/* Registry that contains list of jars, each jar contains its name and list of
function signatures.