[drill] 02/03: Document XML format plugin.

dzamo Wed, 06 Jan 2021 07:10:26 -0800

This is an automated email from the ASF dual-hosted git repository.

dzamo pushed a commit to branch gh-pages
in repository https://gitbox.apache.org/repos/asf/drill.git


commit 8cd3f353b4b2d454e047635a1ec5842b259d83de
Author: James Turton <[email protected]>
AuthorDate: Wed Jan 6 16:53:17 2021 +0200

    Document XML format plugin.
---
 _data/docs.json                                    |  71 ++++++++--
 .../120-xml-format-plugin.md                       | 149 +++++++++++++++++++++
 2 files changed, 210 insertions(+), 10 deletions(-)

diff --git a/_data/docs.json b/_data/docs.json
index fd8b73d..f764bcf 100644
--- a/_data/docs.json
+++ b/_data/docs.json
@@ -3589,14 +3589,31 @@
                         }
                     ],
                     "children": [],
-                    "next_title": "Develop Custom Functions",
-                    "next_url": "/docs/develop-custom-functions/",
+                    "next_title": "XML Format Plugin",
+                    "next_url": "/docs/xml-format-plugin/",
                     "parent": "Data Sources and File Formats",
                     "previous_title": "Excel Format Plugin",
                     "previous_url": "/docs/excel-format-plugin/",
                     "relative_path": 
"_docs/data-sources-and-file-formats/117-hdf5-format-plugin.md",
                     "title": "HDF5 Format Plugin",
                     "url": "/docs/hdf5-format-plugin/"
+                },
+                {
+                    "breadcrumbs": [
+                        {
+                            "title": "Data Sources and File Formats",
+                            "url": "/docs/data-sources-and-file-formats/"
+                        }
+                    ],
+                    "children": [],
+                    "next_title": "Develop Custom Functions",
+                    "next_url": "/docs/develop-custom-functions/",
+                    "parent": "Data Sources and File Formats",
+                    "previous_title": "HDF5 Format Plugin",
+                    "previous_url": "/docs/hdf5-format-plugin/",
+                    "relative_path": 
"_docs/data-sources-and-file-formats/120-xml-format-plugin.md",
+                    "title": "XML Format Plugin",
+                    "url": "/docs/xml-format-plugin/"
                 }
             ],
             "next_title": "Data Sources and File Formats Introduction",
@@ -4162,8 +4179,8 @@
             "next_title": "Develop Custom Functions Introduction",
             "next_url": "/docs/develop-custom-functions-introduction/",
             "parent": "",
-            "previous_title": "HDF5 Format Plugin",
-            "previous_url": "/docs/hdf5-format-plugin/",
+            "previous_title": "XML Format Plugin",
+            "previous_url": "/docs/xml-format-plugin/",
             "relative_path": "_docs/100-develop-custom-functions.md",
             "title": "Develop Custom Functions",
             "url": "/docs/develop-custom-functions/"
@@ -5577,8 +5594,8 @@
                 }
             ],
             "children": [],
-            "next_title": "Develop Custom Functions",
-            "next_url": "/docs/develop-custom-functions/",
+            "next_title": "XML Format Plugin",
+            "next_url": "/docs/xml-format-plugin/",
             "parent": "Data Sources and File Formats",
             "previous_title": "Excel Format Plugin",
             "previous_url": "/docs/excel-format-plugin/",
@@ -16852,6 +16869,23 @@
             "relative_path": "_docs/connect-a-data-source/050-workspaces.md",
             "title": "Workspaces",
             "url": "/docs/workspaces/"
+        },
+        "XML Format Plugin": {
+            "breadcrumbs": [
+                {
+                    "title": "Data Sources and File Formats",
+                    "url": "/docs/data-sources-and-file-formats/"
+                }
+            ],
+            "children": [],
+            "next_title": "Develop Custom Functions",
+            "next_url": "/docs/develop-custom-functions/",
+            "parent": "Data Sources and File Formats",
+            "previous_title": "HDF5 Format Plugin",
+            "previous_url": "/docs/hdf5-format-plugin/",
+            "relative_path": 
"_docs/data-sources-and-file-formats/120-xml-format-plugin.md",
+            "title": "XML Format Plugin",
+            "url": "/docs/xml-format-plugin/"
         }
     },
     "hierarchy": [
@@ -22404,14 +22438,31 @@
                         }
                     ],
                     "children": [],
-                    "next_title": "Develop Custom Functions",
-                    "next_url": "/docs/develop-custom-functions/",
+                    "next_title": "XML Format Plugin",
+                    "next_url": "/docs/xml-format-plugin/",
                     "parent": "Data Sources and File Formats",
                     "previous_title": "Excel Format Plugin",
                     "previous_url": "/docs/excel-format-plugin/",
                     "relative_path": 
"_docs/data-sources-and-file-formats/117-hdf5-format-plugin.md",
                     "title": "HDF5 Format Plugin",
                     "url": "/docs/hdf5-format-plugin/"
+                },
+                {
+                    "breadcrumbs": [
+                        {
+                            "title": "Data Sources and File Formats",
+                            "url": "/docs/data-sources-and-file-formats/"
+                        }
+                    ],
+                    "children": [],
+                    "next_title": "Develop Custom Functions",
+                    "next_url": "/docs/develop-custom-functions/",
+                    "parent": "Data Sources and File Formats",
+                    "previous_title": "HDF5 Format Plugin",
+                    "previous_url": "/docs/hdf5-format-plugin/",
+                    "relative_path": 
"_docs/data-sources-and-file-formats/120-xml-format-plugin.md",
+                    "title": "XML Format Plugin",
+                    "url": "/docs/xml-format-plugin/"
                 }
             ],
             "next_title": "Data Sources and File Formats Introduction",
@@ -22634,8 +22685,8 @@
             "next_title": "Develop Custom Functions Introduction",
             "next_url": "/docs/develop-custom-functions-introduction/",
             "parent": "",
-            "previous_title": "HDF5 Format Plugin",
-            "previous_url": "/docs/hdf5-format-plugin/",
+            "previous_title": "XML Format Plugin",
+            "previous_url": "/docs/xml-format-plugin/",
             "relative_path": "_docs/100-develop-custom-functions.md",
             "title": "Develop Custom Functions",
             "url": "/docs/develop-custom-functions/"
diff --git a/_docs/data-sources-and-file-formats/120-xml-format-plugin.md 
b/_docs/data-sources-and-file-formats/120-xml-format-plugin.md
new file mode 100644
index 0000000..d64e1e8
--- /dev/null
+++ b/_docs/data-sources-and-file-formats/120-xml-format-plugin.md
@@ -0,0 +1,149 @@
+---
+title: "XML Format Plugin"
+slug: "XML Format Plugin"
+parent: "Data Sources and File Formats"
+---
+
+**Introduced in release:** 1.19
+
+This plugin enables Drill to read XML files without defining any kind of 
schema. 
+
+## Configuration
+Aside from the file extension, there is one configuration option:
+
+* `dataLevel`: XML data often contains a considerable amount of nesting which 
is not necessarily useful for data analysis. This parameter allows you to set 
the nesting level 
+  where the data actually starts.  The levels start at `1`.
+
+The default configuration is shown below:
+
+```json
+"xml": {
+  "type": "xml",
+  "extensions": [
+    "xml"
+  ],
+  "dataLevel": 2
+}
+```
+
+## Data Types
+All fields are read as strings.  Nested fields are read as maps.  Future 
functionality could include
+support for lists.
+
+### Attributes
+XML events can have attributes which can also be useful.
+```xml
+<book>
+  <author>O.-J. Dahl</author>
+  <title binding="hardcover" subcategory="non-fiction">Structured 
Programming</title>
+  <category>PROGRAMMING</category>
+  <year>1972</year>
+</book>
+```
+
+In the example above, the `title` field contains two attributes, the `binding` 
and `subcategory`.
+In order to access these fields, Drill creates a map called `attributes` and 
adds an entry for each
+attribute with the field name and then the attribute name.  Every XML file 
will have a field called
+`attributes` regardless of whether the data actually has attributes or not.
+
+```xml
+<books>
+   <book>
+     <author>Mark Twain</author>
+     <title>The Adventures of Tom Sawyer</title>
+     <category>FICTION</category>
+     <year>1876</year>
+   </book>
+   <book>
+     <authors>
+         <author>Niklaus Wirth</author>
+         <author>Somebody else</author>
+     </authors>
+     <title binding="paperback">The Programming Language Pascal</title>
+     <category >PASCAL</category>
+     <year>1971</year>
+   </book>
+   <book>
+     <author>O.-J. Dahl</author>
+     <title binding="hardcover" subcategory="non-fiction">Structured 
Programming</title>
+     <category>PROGRAMMING</category>
+     <year>1972</year>
+   </book>
+ </books>
+```
+If you queried this data in Drill you'd get the table below:
+
+```sql
+SELECT * 
+FROM <path>.`attributes.xml`
+```
+
+```
+apache drill> select * from dfs.test.`attributes.xml`;
+|-----------------------------------------------------------------|------------|---------------------------------|-------------|------|-----------------------------------------|
+| attributes                                                      | author     
| title                           | category    | year | authors                
                 |
+|-----------------------------------------------------------------|------------|---------------------------------|-------------|------|-----------------------------------------|
+| {}                                                              | Mark Twain 
| The Adventures of Tom Sawyer    | FICTION     | 1876 | {}                     
                 |
+| {"title_binding":"paperback"}                                   | null       
| The Programming Language Pascal | PASCAL      | 1971 | {"author":"Niklaus 
WirthSomebody else"} |
+| {"title_binding":"hardcover","title_subcategory":"non-fiction"} | O.-J. Dahl 
| Structured Programming          | PROGRAMMING | 1972 | {}                     
                 |
+|-----------------------------------------------------------------|------------|---------------------------------|-------------|------|-----------------------------------------|
+```
+
+## Limitations:  Malformed XML
+Drill can read properly formatted XML.  If the XML is not properly formatted, 
Drill will throw errors. Some issues include illegal characters in field names 
or attribute names.
+Future functionality will include some degree of data cleaning and fault 
tolerance. 
+
+## Limitations: Schema Ambiguity
+XML is a challenging format to process as the structure does not give any 
hints about the schema.  For example, a JSON file might have the following 
record:
+
+```json
+"record" : {
+  "intField" : 1,
+  "listField" : [1, 2],
+  "otherField" : {
+    "nestedField1" : "foo",
+    "nestedField2" : "bar"
+  }
+}
+```
+
+From this data, it is clear that `listField` is a `list` and `otherField` is a 
`map`.  This same data could be represented in XML as follows:
+
+```xml
+<record>
+  <intField>1</intField>
+  <listField>
+    <value>1</value>
+    <value>2</value>
+  </listField>
+  <otherField>
+    <nestedField1>foo</nestedField1>
+    <nestedField2>bar</nestedField2>
+  </otherField>
+</record>
+```
+
+There is no problem parsing this data. But consider what would happen if we 
encountered the following first:
+```xml
+<record>
+  <intField>1</intField>
+  <listField>
+    <value>2</value>
+  </listField>
+  <otherField>
+    <nestedField1>foo</nestedField1>
+    <nestedField2>bar</nestedField2>
+  </otherField>
+</record>
+```
+
+In this example, there is no way for Drill to know whether `listField` is a 
`list` or a `map`
+because it only has one entry. 
+
+## Future Functionality
+
+* **Build schema from XSD file or link**:  One of the major challenges of this 
reader is having to infer the schema of the data. XML files do provide a schema 
although this is not required.  In the future, if there is interest, we can 
extend this reader to use an XSD file to build the schema which will be used to 
parse the actual XML file. 
+  
+* **Infer Date Fields**: It may be possible to add the ability to infer date 
fields.
+
+* **List Support**:  Future functionality may include the ability to infer 
lists from data structures.

[drill] 02/03: Document XML format plugin.

Reply via email to