This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new e8bb5a1  DRILL-7835: XML Reader Not Reading XML Nested Attributes 
Correctly (#2134)
e8bb5a1 is described below

commit e8bb5a174d9e18ed69e5f4015177bd86024804a9
Author: Charles S. Givre <[email protected]>
AuthorDate: Tue Jan 5 11:04:54 2021 -0500

    DRILL-7835: XML Reader Not Reading XML Nested Attributes Correctly (#2134)
    
    * Initial Commit
    
    * Doc updates
---
 contrib/format-xml/README.md                       |  6 +-
 .../org/apache/drill/exec/store/xml/XMLReader.java | 26 +++++++-
 .../apache/drill/exec/store/xml/TestXMLReader.java | 73 ++++++++++++++++++++++
 .../test/resources/xml/nested-with-attributes.xml  | 63 +++++++++++++++++++
 .../src/test/resources/xml/very-nested.xml         | 38 -----------
 5 files changed, 165 insertions(+), 41 deletions(-)

diff --git a/contrib/format-xml/README.md b/contrib/format-xml/README.md
index dc245a3..3c50ce2 100644
--- a/contrib/format-xml/README.md
+++ b/contrib/format-xml/README.md
@@ -1,5 +1,5 @@
 # XML Format Reader
-This plugin enables Drill to read XML files without defining any kind of 
schema.
+This plugin enables Drill to read XML files without defining any kind of 
schema. 
 
 ## Configuration
 Aside from the file extension, there is one configuration option:
@@ -80,7 +80,9 @@ apache drill> select * from dfs.test.`attributes.xml`;
 
+-----------------------------------------------------------------+------------+---------------------------------+-------------+------+-----------------------------------------+
 ```
 
-
+## Limitations: Malformed XML
+Drill can only read well-formed XML.  If the XML is not well formed, 
Drill will throw an error. Common causes include illegal characters in field 
names or attribute names.
+Future functionality will include some degree of data cleaning and fault 
tolerance. 
 
 ## Limitations: Schema Ambiguity
 XML is a challenging format to process as the structure does not give any 
hints about the schema.  For example, a JSON file might have the following 
record:
diff --git 
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
 
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
index 7665e6a..8fc0343 100644
--- 
a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
+++ 
b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java
@@ -365,6 +365,30 @@ public class XMLReader {
   }
 
   /**
+   * Writes an attribute. If the field does not have a corresponding 
ScalarWriter, this method will
+   * create one.
+   * @param fieldName The field name
+   * @param fieldValue The field value to be written
+   * @param writer The TupleWriter to which the attribute value is written
+   */
+  private void writeAttributeData(String fieldName, String fieldValue, 
TupleWriter writer) {
+    if (fieldName == null) {
+      return;
+    }
+
+    // Look up the column's index in the schema, adding the column if absent
+    int index = writer.tupleSchema().index(fieldName);
+    if (index == -1) {
+      ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName, 
TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
+      index = writer.addColumn(colSchema);
+    }
+    ScalarWriter colWriter = writer.scalar(index);
+    if (fieldValue != null) {
+      colWriter.setString(fieldValue);
+    }
+  }
+
+  /**
    * Returns a MapWriter for a given field.  If the writer does not exist, add 
one to the schema
    * @param mapName The Map's name
    * @param rowWriter The current TupleWriter
@@ -409,7 +433,7 @@ public class XMLReader {
     while (attributes.hasNext()) {
       Attribute currentAttribute = attributes.next();
       String key = prefix + "_" + currentAttribute.getName().toString();
-      writeFieldData(key, currentAttribute.getValue(), attributeWriter);
+      writeAttributeData(key, currentAttribute.getValue(), attributeWriter);
     }
   }
 
diff --git 
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
 
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
index b515dab..522f89c 100644
--- 
a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
+++ 
b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java
@@ -416,6 +416,79 @@ public class TestXMLReader extends ClusterTest {
   }
 
   @Test
+  public void testNestedAttributes() throws Exception {
+    String sql = "SELECT * FROM cp.`xml/nested-with-attributes.xml`";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addMap("attributes")
+        .addNullable("field1_f1", MinorType.VARCHAR)
+        .addNullable("field2_f2", MinorType.VARCHAR)
+        .addNullable("field2_key3_f3", MinorType.VARCHAR)
+        .addNullable("field2_nestedField1_f4", MinorType.VARCHAR)
+        .addNullable("field2_nestedField1_f5", MinorType.VARCHAR)
+        .addNullable("field2_nestedField1_nk1_f6", MinorType.VARCHAR)
+        .addNullable("field2_nestedField1_nk1_f7", MinorType.VARCHAR)
+        .addNullable("field2_nestedField1_nk3_f8", MinorType.VARCHAR)
+      .resumeSchema()
+      .addMap("field1")
+      .addNullable("key1", MinorType.VARCHAR)
+      .addNullable("key2", MinorType.VARCHAR)
+      .resumeSchema()
+      .addMap("field2")
+      .addNullable("key3", MinorType.VARCHAR)
+      .addMap("nestedField1")
+      .addNullable("nk1", MinorType.VARCHAR)
+      .addNullable("nk2", MinorType.VARCHAR)
+      .addNullable("nk3", MinorType.VARCHAR)
+      .resumeMap()
+      .resumeSchema()
+      .buildSchema();
+
+    RowSet expected = client.rowSetBuilder(expectedSchema)
+      .addRow(strArray("k1", "k2", "k3", "k4", "k5", "k6", "k7", null), 
strArray("value1", "value2"), objArray("k1", strArray("nk_value1", "nk_value2", 
"nk_value3")))
+      .addRow(strArray(null, null, null, null, null, null, null, null), 
strArray("value3", "value4"), objArray("k2", strArray("nk_value4", "nk_value5", 
"nk_value6")))
+      .addRow(strArray(null, null, null, null, null, null, null, "k8"), 
strArray("value5", "value6"), objArray("k3", strArray("nk_value7", "nk_value8", 
"nk_value9")))
+      .build();
+
+    assertEquals(3, results.rowCount());
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testExplicitNestedAttributes() throws Exception {
+    String sql = "SELECT data.attributes.field1_f1 AS field1_f1," +
+      "data.attributes.field2_f2 AS field2_f2, " +
+      "data.attributes.field2_key3_f3 AS field2_key3_f3," +
+      "data.attributes.field2_nestedField1_f4 AS field2_nestedField1_f4," +
+      "data.attributes.field2_nestedField1_f5 AS field2_nestedField1_f5, " +
+      "data.attributes.field2_nestedField1_nk1_f6 AS 
field2_nestedField1_nk1_f6, " +
+      "data.attributes.field2_nestedField1_nk1_f7 AS 
field2_nestedField1_nk1_f7," +
+      "data.attributes.field2_nestedField1_nk3_f8 AS 
field2_nestedField1_nk3_f8 " +
+      "FROM cp.`xml/nested-with-attributes.xml` AS data";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("field1_f1", MinorType.VARCHAR)
+      .addNullable("field2_f2", MinorType.VARCHAR)
+      .addNullable("field2_key3_f3", MinorType.VARCHAR)
+      .addNullable("field2_nestedField1_f4", MinorType.VARCHAR)
+      .addNullable("field2_nestedField1_f5", MinorType.VARCHAR)
+      .addNullable("field2_nestedField1_nk1_f6", MinorType.VARCHAR)
+      .addNullable("field2_nestedField1_nk1_f7", MinorType.VARCHAR)
+      .addNullable("field2_nestedField1_nk3_f8", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = client.rowSetBuilder(expectedSchema)
+      .addRow("k1", "k2", "k3", "k4", "k5", "k6", "k7", null)
+      .addRow(null, null, null, null, null, null, null, null)
+      .addRow(null, null, null, null, null, null, null, "k8")
+      .build();
+    assertEquals(3, results.rowCount());
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
   public void testLimitPushdown() throws Exception {
     String sql = "SELECT * FROM cp.`xml/simple.xml` LIMIT 2";
 
diff --git 
a/contrib/format-xml/src/test/resources/xml/nested-with-attributes.xml 
b/contrib/format-xml/src/test/resources/xml/nested-with-attributes.xml
new file mode 100644
index 0000000..c09ae90
--- /dev/null
+++ b/contrib/format-xml/src/test/resources/xml/nested-with-attributes.xml
@@ -0,0 +1,63 @@
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+-->
+<books>
+  <book>
+    <field1 f1="k1">
+      <key1>value1</key1>
+      <key2>value2</key2>
+    </field1>
+    <field2 f2="k2">
+      <key3 f3="k3">k1</key3>
+      <nestedField1 f4="k4" f5="k5">
+        <nk1 f6="k6" f7="k7">nk_value1</nk1>
+        <nk2>nk_value2</nk2>
+        <nk3>nk_value3</nk3>
+      </nestedField1>
+    </field2>
+  </book>
+  <book>
+    <field1>
+      <key1>value3</key1>
+      <key2>value4</key2>
+    </field1>
+    <field2>
+      <key3>k2</key3>
+      <nestedField1>
+        <nk1>nk_value4</nk1>
+        <nk2>nk_value5</nk2>
+        <nk3>nk_value6</nk3>
+      </nestedField1>
+    </field2>
+  </book>
+  <book>
+    <field1>
+      <key1>value5</key1>
+      <key2>value6</key2>
+    </field1>
+    <field2>
+      <key3>k3</key3>
+      <nestedField1>
+        <nk1>nk_value7</nk1>
+        <nk2>nk_value8</nk2>
+        <nk3 f8="k8">nk_value9</nk3>
+      </nestedField1>
+    </field2>
+  </book>
+</books>
\ No newline at end of file
diff --git a/contrib/format-xml/src/test/resources/xml/very-nested.xml 
b/contrib/format-xml/src/test/resources/xml/very-nested.xml
deleted file mode 100644
index bdf8a0c..0000000
--- a/contrib/format-xml/src/test/resources/xml/very-nested.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<!--
-
-    Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-
--->
-<book>
-  <field1>
-    <key1>value1</key1>
-    <key2>value2</key2>
-  </field1>
-  <field2>
-    <key3>k1</key3>
-    <nestedField1>
-      <nk1>nk_value1</nk1>
-      <nk2>nk_value2</nk2>
-      <nk3>nk_value3</nk3>
-      <nestedField2>
-        <nk1>nk2_value1</nk1>
-        <nk2>nk2_value2</nk2>
-        <nk3>nk2_value3</nk3>
-      </nestedField2>
-    </nestedField1>
-  </field2>
-</book>
\ No newline at end of file

Reply via email to