This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new e8bb5a1 DRILL-7835: XML Reader Not Reading XML Nested Attributes
Correctly (#2134)
e8bb5a1 is described below
commit e8bb5a174d9e18ed69e5f4015177bd86024804a9
Author: Charles S. Givre <[email protected]>
AuthorDate: Tue Jan 5 11:04:54 2021 -0500
DRILL-7835: XML Reader Not Reading XML Nested Attributes Correctly (#2134)
* Initial Commit
* Doc updates
---
contrib/format-xml/README.md | 6 +-
.../org/apache/drill/exec/store/xml/XMLReader.java | 26 +++++++-
.../apache/drill/exec/store/xml/TestXMLReader.java | 73 ++++++++++++++++++++++
.../test/resources/xml/nested-with-attributes.xml | 63 +++++++++++++++++++
.../src/test/resources/xml/very-nested.xml | 38 -----------
5 files changed, 165 insertions(+), 41 deletions(-)
diff --git a/contrib/format-xml/README.md b/contrib/format-xml/README.md
index dc245a3..3c50ce2 100644
--- a/contrib/format-xml/README.md
+++ b/contrib/format-xml/README.md
@@ -1,5 +1,5 @@
# XML Format Reader
-This plugin enables Drill to read XML files without defining any kind of
schema.
+This plugin enables Drill to read XML files without defining any kind of
schema.
## Configuration
Aside from the file extension, there is one configuration option:
@@ -80,7 +80,9 @@ apache drill> select * from dfs.test.`attributes.xml`;
+-----------------------------------------------------------------+------------+---------------------------------+-------------+------+-----------------------------------------+
```
-
+## Limitations: Malformed XML
+Drill can only read properly formatted XML. If the XML is not properly formatted,
Drill will throw errors. Known causes include illegal characters in field names
or attribute names.
+Future functionality will include some degree of data cleaning and fault
tolerance.
## Limitations: Schema Ambiguity
XML is a challenging format to process as the structure does not give any
hints about the schema. For example, a JSON file might have the following
record:
diff --git
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
index 7665e6a..8fc0343 100644
---
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
+++
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
@@ -365,6 +365,30 @@ public class XMLReader {
}
/**
+ * Writes an attribute. If the field does not have a corresponding
ScalarWriter, this method will
+ * create one.
+ * @param fieldName The field name
+ * @param fieldValue The field value to be written
+ * @param writer The TupleWriter to which the attribute column is written
+ */
+ private void writeAttributeData(String fieldName, String fieldValue,
TupleWriter writer) {
+ if (fieldName == null) {
+ return;
+ }
+
+ // Find the column's index in the tuple schema, adding the column if it is missing
+ int index = writer.tupleSchema().index(fieldName);
+ if (index == -1) {
+ ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName,
TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
+ index = writer.addColumn(colSchema);
+ }
+ ScalarWriter colWriter = writer.scalar(index);
+ if (fieldValue != null) {
+ colWriter.setString(fieldValue);
+ }
+ }
+
+ /**
* Returns a MapWriter for a given field. If the writer does not exist, add
one to the schema
* @param mapName The Map's name
* @param rowWriter The current TupleWriter
@@ -409,7 +433,7 @@ public class XMLReader {
while (attributes.hasNext()) {
Attribute currentAttribute = attributes.next();
String key = prefix + "_" + currentAttribute.getName().toString();
- writeFieldData(key, currentAttribute.getValue(), attributeWriter);
+ writeAttributeData(key, currentAttribute.getValue(), attributeWriter);
}
}
diff --git
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
index b515dab..522f89c 100644
---
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
+++
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
@@ -416,6 +416,79 @@ public class TestXMLReader extends ClusterTest {
}
@Test
+ public void testNestedAttributes() throws Exception {
+ String sql = "SELECT * FROM cp.`xml/nested-with-attributes.xml`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addMap("attributes")
+ .addNullable("field1_f1", MinorType.VARCHAR)
+ .addNullable("field2_f2", MinorType.VARCHAR)
+ .addNullable("field2_key3_f3", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_f4", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_f5", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_nk1_f6", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_nk1_f7", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_nk3_f8", MinorType.VARCHAR)
+ .resumeSchema()
+ .addMap("field1")
+ .addNullable("key1", MinorType.VARCHAR)
+ .addNullable("key2", MinorType.VARCHAR)
+ .resumeSchema()
+ .addMap("field2")
+ .addNullable("key3", MinorType.VARCHAR)
+ .addMap("nestedField1")
+ .addNullable("nk1", MinorType.VARCHAR)
+ .addNullable("nk2", MinorType.VARCHAR)
+ .addNullable("nk3", MinorType.VARCHAR)
+ .resumeMap()
+ .resumeSchema()
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow(strArray("k1", "k2", "k3", "k4", "k5", "k6", "k7", null),
strArray("value1", "value2"), objArray("k1", strArray("nk_value1", "nk_value2",
"nk_value3")))
+ .addRow(strArray(null, null, null, null, null, null, null, null),
strArray("value3", "value4"), objArray("k2", strArray("nk_value4", "nk_value5",
"nk_value6")))
+ .addRow(strArray(null, null, null, null, null, null, null, "k8"),
strArray("value5", "value6"), objArray("k3", strArray("nk_value7", "nk_value8",
"nk_value9")))
+ .build();
+
+ assertEquals(3, results.rowCount());
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testExplicitNestedAttributes() throws Exception {
+ String sql = "SELECT data.attributes.field1_f1 AS field1_f1," +
+ "data.attributes.field2_f2 AS field2_f2, " +
+ "data.attributes.field2_key3_f3 AS field2_key3_f3," +
+ "data.attributes.field2_nestedField1_f4 AS field2_nestedField1_f4," +
+ "data.attributes.field2_nestedField1_f5 AS field2_nestedField1_f5, " +
+ "data.attributes.field2_nestedField1_nk1_f6 AS
field2_nestedField1_nk1_f6, " +
+ "data.attributes.field2_nestedField1_nk1_f7 AS
field2_nestedField1_nk1_f7," +
+ "data.attributes.field2_nestedField1_nk3_f8 AS
field2_nestedField1_nk3_f8 " +
+ "FROM cp.`xml/nested-with-attributes.xml` AS data";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("field1_f1", MinorType.VARCHAR)
+ .addNullable("field2_f2", MinorType.VARCHAR)
+ .addNullable("field2_key3_f3", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_f4", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_f5", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_nk1_f6", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_nk1_f7", MinorType.VARCHAR)
+ .addNullable("field2_nestedField1_nk3_f8", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = client.rowSetBuilder(expectedSchema)
+ .addRow("k1", "k2", "k3", "k4", "k5", "k6", "k7", null)
+ .addRow(null, null, null, null, null, null, null, null)
+ .addRow(null, null, null, null, null, null, null, "k8")
+ .build();
+ assertEquals(3, results.rowCount());
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
public void testLimitPushdown() throws Exception {
String sql = "SELECT * FROM cp.`xml/simple.xml` LIMIT 2";
diff --git
a/contrib/format-xml/src/test/resources/xml/nested-with-attributes.xml
b/contrib/format-xml/src/test/resources/xml/nested-with-attributes.xml
new file mode 100644
index 0000000..c09ae90
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/nested-with-attributes.xml
@@ -0,0 +1,63 @@
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<books>
+ <book>
+ <field1 f1="k1">
+ <key1>value1</key1>
+ <key2>value2</key2>
+ </field1>
+ <field2 f2="k2">
+ <key3 f3="k3">k1</key3>
+ <nestedField1 f4="k4" f5="k5">
+ <nk1 f6="k6" f7="k7">nk_value1</nk1>
+ <nk2>nk_value2</nk2>
+ <nk3>nk_value3</nk3>
+ </nestedField1>
+ </field2>
+ </book>
+ <book>
+ <field1>
+ <key1>value3</key1>
+ <key2>value4</key2>
+ </field1>
+ <field2>
+ <key3>k2</key3>
+ <nestedField1>
+ <nk1>nk_value4</nk1>
+ <nk2>nk_value5</nk2>
+ <nk3>nk_value6</nk3>
+ </nestedField1>
+ </field2>
+ </book>
+ <book>
+ <field1>
+ <key1>value5</key1>
+ <key2>value6</key2>
+ </field1>
+ <field2>
+ <key3>k3</key3>
+ <nestedField1>
+ <nk1>nk_value7</nk1>
+ <nk2>nk_value8</nk2>
+ <nk3 f8="k8">nk_value9</nk3>
+ </nestedField1>
+ </field2>
+ </book>
+</books>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/very-nested.xml
b/contrib/format-xml/src/test/resources/xml/very-nested.xml
deleted file mode 100644
index bdf8a0c..0000000
--- a/contrib/format-xml/src/test/resources/xml/very-nested.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<!--
-
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
--->
-<book>
- <field1>
- <key1>value1</key1>
- <key2>value2</key2>
- </field1>
- <field2>
- <key3>k1</key3>
- <nestedField1>
- <nk1>nk_value1</nk1>
- <nk2>nk_value2</nk2>
- <nk3>nk_value3</nk3>
- <nestedField2>
- <nk1>nk2_value1</nk1>
- <nk2>nk2_value2</nk2>
- <nk3>nk2_value3</nk3>
- </nestedField2>
- </nestedField1>
- </field2>
-</book>
\ No newline at end of file