[nifi] 01/03: NIFI-9832: Fix disappearing XML element content when the element has attribute (#5896)

joewitt Tue, 12 Apr 2022 08:52:34 -0700

This is an automated email from the ASF dual-hosted git repository.

joewitt pushed a commit to branch support/nifi-1.16
in repository https://gitbox.apache.org/repos/asf/nifi.git


commit b9458c0f926b77b589d3263dff2517be3888b3a8
Author: Peter Gyori <[email protected]>
AuthorDate: Tue Apr 12 17:41:50 2022 +0200

    NIFI-9832: Fix disappearing XML element content when the element has 
attribute (#5896)
    
    - NIFI-9832: Additional test cases for XMLReader
---
 .../nifi-record-serialization-services/pom.xml     |   2 +
 .../main/java/org/apache/nifi/xml/XMLReader.java   |  12 +-
 .../java/org/apache/nifi/xml/XMLRecordReader.java  |  11 +-
 .../apache/nifi/xml/inference/XmlRecordSource.java |   6 +-
 .../additionalDetails.html                         | 351 +++++++++++++++++++++
 .../org/apache/nifi/xml/TestInferXmlSchema.java    |  11 +-
 .../java/org/apache/nifi/xml/TestXMLReader.java    | 218 ++++++++++---
 .../test/resources/xml/field_with_sub-element.xml  |   4 +
 .../src/test/resources/xml/person_record.xml       |   5 +
 9 files changed, 561 insertions(+), 59 deletions(-)

diff --git 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml
 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml
index e3ac2d4e2a..19b8997874 100755
--- 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml
+++ 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml
@@ -218,6 +218,7 @@
                         
<exclude>src/test/resources/syslog/syslog5424/log_mix.txt</exclude>
                         
<exclude>src/test/resources/syslog/syslog5424/log_mix_in_error.txt</exclude>
                         <exclude>src/test/resources/text/testschema</exclude>
+                        
<exclude>src/test/resources/xml/field_with_sub-element.xml</exclude>
                         <exclude>src/test/resources/xml/people.xml</exclude>
                         <exclude>src/test/resources/xml/people2.xml</exclude>
                         <exclude>src/test/resources/xml/people3.xml</exclude>
@@ -236,6 +237,7 @@
                         
<exclude>src/test/resources/xml/people_tag_in_characters.xml</exclude>
                         
<exclude>src/test/resources/xml/people_with_header_and_comments.xml</exclude>
                         <exclude>src/test/resources/xml/person.xml</exclude>
+                        
<exclude>src/test/resources/xml/person_record.xml</exclude>
                         <exclude>src/test/resources/xml/testschema</exclude>
                         <exclude>src/test/resources/xml/testschema2</exclude>
                         <exclude>src/test/resources/xml/testschema3</exclude>
diff --git 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java
 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java
index 52a9701507..20ee5195c6 100644
--- 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java
+++ 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java
@@ -97,7 +97,10 @@ public class XMLReader extends SchemaRegistryService 
implements RecordReaderFact
             .description("If tags with content (e. g. <field>content</field>) 
are defined as nested records in the schema, " +
                     "the name of the tag will be used as name for the record 
and the value of this property will be used as name for the field. " +
                     "If tags with content shall be parsed together with 
attributes (e. g. <field attribute=\"123\">content</field>), " +
-                    "they have to be defined as records. For additional 
information, see the section of processor usage.")
+                    "they have to be defined as records. In such a case, the 
name of the tag will be used as the name for the record and  " +
+                    "the value of this property will be used as the name for 
the field holding the original content. The name of the attribute " +
+                    "will be used to create a new record field, the content of 
which will be the value of the attribute. " +
+                    "For more information, see the 'Additional Details...' 
section of the XMLReader controller service's documentation.")
             .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
             
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
             .required(false)
@@ -136,7 +139,12 @@ public class XMLReader extends SchemaRegistryService 
implements RecordReaderFact
 
     @Override
     protected SchemaAccessStrategy getSchemaAccessStrategy(final String 
strategy, final SchemaRegistry schemaRegistry, final PropertyContext context) {
-        final RecordSourceFactory<XmlNode> sourceFactory = (variables, 
contentStream) -> new XmlRecordSource(contentStream, isMultipleRecords(context, 
variables));
+
+        final RecordSourceFactory<XmlNode> sourceFactory = (variables, 
contentStream) -> {
+            String contentFieldName = 
trim(context.getProperty(CONTENT_FIELD_NAME).evaluateAttributeExpressions(variables).getValue());
+            contentFieldName = (contentFieldName == null) ? "value" : 
contentFieldName;
+            return new XmlRecordSource(contentStream, contentFieldName, 
isMultipleRecords(context, variables));
+        };
         final Supplier<SchemaInferenceEngine<XmlNode>> schemaInference = () -> 
new XmlSchemaInference(new TimeValueInference(dateFormat, timeFormat, 
timestampFormat));
 
         return SchemaInferenceUtil.getSchemaAccessStrategy(strategy, context, 
getLogger(), sourceFactory, schemaInference,
diff --git 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java
 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java
index 2cb165b6e9..fd6d23b3e2 100644
--- 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java
+++ 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java
@@ -339,8 +339,8 @@ public class XMLRecordReader implements RecordReader {
                 if (contentFieldName != null) {
                     recordValues.put(contentFieldName, content.toString());
                 } else {
-                    logger.debug("Found content for field that has to be 
parsed as record but property \"Field Name for Content\" is not set. " +
-                            "The content will not be added to the record.");
+                    logger.debug("Found content for a field that was supposed 
to be named with the value of the \"Field Name for Content\" property but " +
+                            "the property was not set. The content was not 
added to the record.");
                 }
 
                 return new MapRecord(new 
SimpleRecordSchema(Collections.emptyList()), recordValues);
@@ -486,10 +486,13 @@ public class XMLRecordReader implements RecordReader {
                 if (field.isPresent()) {
                     Object value = parseStringForType(content.toString(), 
contentFieldName, field.get().getDataType());
                     recordValues.put(contentFieldName, value);
+                } else {
+                    logger.debug("Found content for a field that was supposed 
to be named with the value of the \"Field Name for Content\" property " +
+                            "but no such field was present in the schema. The 
content was not added to the record.");
                 }
             } else {
-                logger.debug("Found content for field that is defined as 
record but property \"Field Name for Content\" is not set. " +
-                        "The content will not be added to record.");
+                logger.debug("Found content for a field that was supposed to 
be named with the value of the \"Field Name for Content\" property but " +
+                        "the property was not set. The content was not added 
to the record.");
             }
         }
 
diff --git 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java
 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java
index 3192e141aa..8352aed08d 100644
--- 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java
+++ 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java
@@ -35,8 +35,10 @@ import java.util.Map;
 public class XmlRecordSource implements RecordSource<XmlNode> {
 
     private final XMLEventReader xmlEventReader;
+    private final String contentFieldName;
 
-    public XmlRecordSource(final InputStream in, final boolean ignoreWrapper) 
throws IOException {
+    public XmlRecordSource(final InputStream in, final String 
contentFieldName, final boolean ignoreWrapper) throws IOException {
+        this.contentFieldName = contentFieldName;
         try {
             final XMLInputFactory xmlInputFactory = 
XMLInputFactory.newInstance();
 
@@ -125,7 +127,7 @@ public class XmlRecordSource implements 
RecordSource<XmlNode> {
         } else {
             final String textContent = content.toString().trim();
             if (!textContent.equals("")) {
-                childNodes.put("value", new XmlTextNode("value", textContent));
+                childNodes.put(contentFieldName, new 
XmlTextNode(contentFieldName, textContent));
             }
 
             return new XmlContainerNode(nodeName, childNodes);
diff --git 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html
 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html
index b8e1dff0d8..adcb3cc550 100755
--- 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html
+++ 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html
@@ -286,6 +286,357 @@
         for tags containing attributes and content.
     </p>
 
+    <h2>Example: Tags with Attributes and Schema Inference</h2>
+
+    <p>
+        When the record's schema is not provided but inferred based on the 
data itself, providing a value for the "Field Name for Content" property
+        is especially important. (For detailed information on schema 
inference, see the "Schema Inference" section below.)
+        Let's focus on cases where an XML element (called 
<code>&lt;field_with_attribute&gt;</code> in the examples) has an XML attribute 
and some content and no sub-elements.
+        For the examples below, let's assume that a ConvertRecord processor is 
used, and it uses an XMLReader controller service and an XMLRecordSetWriter
+        controller service. The settings for XMLReader are provided separately 
for each example. The settings for XMLRecordSetWriter are common
+        for all the examples below. This way an XML to XML conversion is 
executed and comparing the input data with the output highlights
+        the schema inference behavior. The same behavior can be observed if a 
different Writer controller service is used.
+        XMLRecordSetWriter was chosen for these examples so that the input and 
the output are easily comparable.
+        The settings of the common XMLRecordSetWriter are the following:
+    </p>
+
+    <table>
+        <tr>
+            <th>Property Name</th>
+            <th>Property Value</th>
+        </tr>
+        <tr>
+            <td>Schema Access Strategy</td>
+            <td><code>Inherit Record Schema</code></td>
+        </tr>
+        <tr>
+            <td>Suppress Null Values</td>
+            <td><code>Never Suppress</code></td>
+        </tr>
+    </table>
+
+    <h3>XML Attributes and Schema Inference Example 1</h3>
+
+    <p>
+        XMLReader settings:
+    </p>
+
+    <table>
+        <tr>
+            <th>Property Name</th>
+            <th>Property Value</th>
+        </tr>
+        <tr>
+            <td>Schema Access Strategy</td>
+            <td><code>Infer Schema</code></td>
+        </tr>
+        <tr>
+            <td>Expect Records as Array</td>
+            <td><code>false</code></td>
+        </tr>
+        <tr>
+            <td>Field Name for Content</td>
+            <td>not set</td>
+        </tr>
+    </table>
+
+    <p>
+        Input:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute attr="attr_content"&gt;
+                        content of field
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>As mentioned above, the element called "field_with_attribute" has an 
attribute and some content but no sub-element.</p>
+
+    <p>
+        Output:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute&gt;
+                        &lt;attr&gt;attr_content&lt;/attr&gt;
+                        &lt;value&gt;&lt;/value&gt;
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        In the XMLReader's settings, no value is set for the "Field Name for 
Content" property. In such cases the schema inference logic
+        adds a field named "value" to the schema. However, since "Field Name 
for Content" is not set, the data processing logic is instructed
+        not to consider the original content of the parent XML tags 
(<code>&lt;field_with_attribute&gt;</code> the content of which is "content of 
field"
+        in the example). So a new field named "value" appears in the schema 
but no value is assigned to it from the data, thus the field is empty.
+        The XML attribute (named "attr") is processed, a field named "attr" is 
added to the schema and the attribute's value ("attr_content") is assigned to 
it.
+        In a case like this, the parent field's original content is lost and a 
new field named "value" appears in the schema with no data assigned to it.
+        This is to make sure that no data is overwritten in the record if it 
already contains a field named "value". More on that case in Example 3 and 
Example 4.
+    </p>
+
+    <h3>XML Attributes and Schema Inference Example 2</h3>
+
+    <p>
+        In this example, the XMLReader's "Field Name for Content" property is 
filled with the value "original_content". The input data is the same as
+        in the previous example.
+    </p>
+
+    <p>
+        XMLReader settings:
+    </p>
+
+    <table>
+        <tr>
+            <th>Property Name</th>
+            <th>Property Value</th>
+        </tr>
+        <tr>
+            <td>Schema Access Strategy</td>
+            <td><code>Infer Schema</code></td>
+        </tr>
+        <tr>
+            <td>Expect Records as Array</td>
+            <td><code>false</code></td>
+        </tr>
+        <tr>
+            <td>Field Name for Content</td>
+            <td><code>original_content</code></td>
+        </tr>
+    </table>
+
+    <p>
+        Input:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute attr="attr_content"&gt;
+                        content of field
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        Output:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute&gt;
+                        &lt;attr&gt;attr_content&lt;/attr&gt;
+                        &lt;original_content&gt;content of 
field&lt;/original_content&gt;
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        The XMLReader's "Field Name for Content" property contains the value 
"original_content" (the concrete value is not important; what is important
+        is that a value is provided and it does not clash with the name of any 
sub-element in <code>&lt;field_with_attribute&gt;</code>).
+        This explicitly tells the XMLReader controller service to create a 
field named "original_content" and make the original content of
+        the parent XML tag the value of the field named "original_content". 
Adding the XML attribute named "attr" works just like in the first example.
+        Since the <code>&lt;field_with_attribute&gt;</code> element had no 
child-element with the name "original_content", no data is lost.
+    </p>
+
+    <h3>XML Attributes and Schema Inference Example 3</h3>
+
+    <p>
+        In this example, XMLReader's "Field Name for Content" property is left 
empty. In the input data, the <code>&lt;field_with_attribute&gt;</code> element
+        has some content and a sub-element named <code>&lt;value&gt;</code>.
+    </p>
+
+    <p>
+        XMLReader settings:
+    </p>
+
+    <table>
+        <tr>
+            <th>Property Name</th>
+            <th>Property Value</th>
+        </tr>
+        <tr>
+            <td>Schema Access Strategy</td>
+            <td><code>Infer Schema</code></td>
+        </tr>
+        <tr>
+            <td>Expect Records as Array</td>
+            <td><code>false</code></td>
+        </tr>
+        <tr>
+            <td>Field Name for Content</td>
+            <td>not set</td>
+        </tr>
+    </table>
+
+    <p>
+        Input:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute attr="attr_content"&gt;
+                          content of field&lt;value&gt;123&lt;/value&gt;
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        Output:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute&gt;
+                        &lt;attr&gt;attr_content&lt;/attr&gt;
+                        &lt;value&gt;123&lt;/value&gt;
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        The "Field Name for Content" property is not set, and the XML element 
has a sub-element named "value". The name of the sub-element clashes with the
+        default field name added to the schema by the Schema Inference logic 
(see Example 1). As seen in the output data, the input XML attribute's value
+        is added to the record just like in the previous examples. The value 
of the <code>&lt;value&gt;</code> element is retained, but the content of the
+        <code>&lt;field_with_attribute&gt;</code> element that was outside of the 
sub-element is lost.
+    </p>
+
+    <h3>XML Attributes and Schema Inference Example 4</h3>
+
+    <p>
+        In this example, XMLReader's "Field Name for Content" property is 
given the value "value". In the input data, the 
<code>&lt;field_with_attribute&gt;</code> element
+       has some content and a sub-element named <code>&lt;value&gt;</code>. 
The name of the sub-element clashes with the value of the "Field Name for 
Content" property.
+    </p>
+
+    <p>
+        XMLReader settings:
+    </p>
+
+    <table>
+        <tr>
+            <th>Property Name</th>
+            <th>Property Value</th>
+        </tr>
+        <tr>
+            <td>Schema Access Strategy</td>
+            <td><code>Infer Schema</code></td>
+        </tr>
+        <tr>
+            <td>Expect Records as Array</td>
+            <td><code>false</code></td>
+        </tr>
+        <tr>
+            <td>Field Name for Content</td>
+            <td><code>value</code></td>
+        </tr>
+    </table>
+
+    <p>
+        Input:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute attr="attr_content"&gt;
+                          content of field&lt;value&gt;123&lt;/value&gt;
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        Output:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute&gt;
+                        &lt;attr&gt;attr_content&lt;/attr&gt;
+                        &lt;value&gt;content of field&lt;/value&gt;
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        The "Field Name for Content" property's value is "value", and the XML 
element has a sub-element named "value". The name of the sub-element clashes 
with the
+        value of the "Field Name for Content" property. The value of the 
<code>&lt;value&gt;</code> element is replaced by the content of the
+        <code>&lt;field_with_attribute&gt;</code> element, and the original 
content of the <code>&lt;value&gt;</code> element is lost.
+    </p>
+
+    <h3>XML Attributes and Schema Inference Example 5</h3>
+
+    <p>
+        To avoid losing any data, the XMLReader's "Field Name for Content" 
property needs to be given a value that does not clash with any sub-element's 
name
+        in the input data. In this example the input data is the same as in 
the previous one, but the "Field Name for Content" property's value is 
"original_content",
+        a value that does not clash with any sub-element name. No data is lost 
in this case.
+    </p>
+
+    <p>
+        XMLReader settings:
+    </p>
+
+    <table>
+        <tr>
+            <th>Property Name</th>
+            <th>Property Value</th>
+        </tr>
+        <tr>
+            <td>Schema Access Strategy</td>
+            <td><code>Infer Schema</code></td>
+        </tr>
+        <tr>
+            <td>Expect Records as Array</td>
+            <td><code>false</code></td>
+        </tr>
+        <tr>
+            <td>Field Name for Content</td>
+            <td><code>original_content</code></td>
+        </tr>
+    </table>
+
+    <p>
+        Input:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute attr="attr_content"&gt;
+                          content of field&lt;value&gt;123&lt;/value&gt;
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        Output:
+    </p>
+
+    <code>
+            <pre>
+                &lt;record&gt;
+                    &lt;field_with_attribute&gt;
+                        &lt;attr&gt;attr_content&lt;/attr&gt;
+                        &lt;value&gt;123&lt;/value&gt;
+                        &lt;original_content&gt;content of 
field&lt;/original_content&gt;
+                    &lt;/field_with_attribute&gt;
+                &lt;/record&gt;</pre>
+    </code>
+
+    <p>
+        It can be seen in the output data that the attribute has been added 
to the <code>&lt;field_with_attribute&gt;</code> element as a sub-element,
+        the <code>&lt;value&gt;</code> element retained its value, and the original 
content of the <code>&lt;field_with_attribute&gt;</code> element has been added 
as a sub-element
+        named "original_content". This is because a value was chosen for the 
"Field Name for Content" property that does not clash with any of
+        the existing sub-elements of the input XML element 
(<code>&lt;field_with_attribute&gt;</code>). No data is lost.
+    </p>
+
     <h2>Example: Array of records</h2>
 
     <p>
diff --git 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java
 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java
index b5bdd76025..2a4cd14f10 100644
--- 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java
+++ 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java
@@ -93,7 +93,8 @@ public class TestInferXmlSchema {
 
     @Test
     public void testStringFieldWithAttributes() throws IOException {
-        final RecordSchema schema = 
inferSchema("src/test/resources/xml/TextNodeWithAttribute.xml", true);
+        final String contentFieldName = "contentfield";
+        final RecordSchema schema = 
inferSchema("src/test/resources/xml/TextNodeWithAttribute.xml", 
contentFieldName, true);
 
         assertEquals(3, schema.getFieldCount());
 
@@ -106,12 +107,16 @@ public class TestInferXmlSchema {
 
         final RecordSchema childSchema = ((RecordDataType) 
softwareDataType).getChildSchema();
         assertSame(RecordFieldType.BOOLEAN, 
childSchema.getDataType("favorite").get().getFieldType());
-        assertSame(RecordFieldType.STRING, 
childSchema.getDataType("value").get().getFieldType());
+        assertSame(RecordFieldType.STRING, 
childSchema.getDataType(contentFieldName).get().getFieldType());
     }
 
     private RecordSchema inferSchema(final String filename, final boolean 
ignoreWrapper) throws IOException {
+        return inferSchema(filename, "contentfield", ignoreWrapper);
+    }
+
+    private RecordSchema inferSchema(final String filename, final String 
contentFieldName, final boolean ignoreWrapper) throws IOException {
         final File file = new File(filename);
-        final RecordSourceFactory<XmlNode> xmlSourceFactory = (var, in) ->  
new XmlRecordSource(in, ignoreWrapper);
+        final RecordSourceFactory<XmlNode> xmlSourceFactory = (var, in) ->  
new XmlRecordSource(in, contentFieldName, ignoreWrapper);
         final SchemaInferenceEngine<XmlNode> schemaInference = new 
XmlSchemaInference(timeValueInference);
         final InferSchemaAccessStrategy<XmlNode> inferStrategy = new 
InferSchemaAccessStrategy<>(xmlSourceFactory, schemaInference, 
Mockito.mock(ComponentLog.class));
 
diff --git 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java
 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java
index 5de7eac402..6904a6f599 100644
--- 
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java
+++ 
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java
@@ -17,8 +17,10 @@
 
 package org.apache.nifi.xml;
 
+import org.apache.nifi.components.PropertyDescriptor;
 import org.apache.nifi.reporting.InitializationException;
 import org.apache.nifi.schema.access.SchemaAccessUtils;
+import org.apache.nifi.schema.inference.SchemaInferenceUtil;
 import org.apache.nifi.util.MockFlowFile;
 import org.apache.nifi.util.TestRunner;
 import org.apache.nifi.util.TestRunners;
@@ -31,43 +33,48 @@ import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import static junit.framework.TestCase.assertEquals;
 
 public class TestXMLReader {
 
-    private XMLReader reader;
-
     private final String ATTRIBUTE_PREFIX = "attribute_prefix";
     private final String CONTENT_NAME = "content_field";
     private final String EVALUATE_IS_ARRAY = "xml.stream.is.array";
 
-    public TestRunner setup(String filePath) throws InitializationException, 
IOException {
-
+    private TestRunner setup(Map<PropertyDescriptor, String> 
xmlReaderProperties) throws InitializationException {
         TestRunner runner = 
TestRunners.newTestRunner(TestXMLReaderProcessor.class);
-        reader = new XMLReader();
+        XMLReader reader = new XMLReader();
+
         runner.addControllerService("xml_reader", reader);
         runner.setProperty(TestXMLReaderProcessor.XML_READER, "xml_reader");
 
-        final String outputSchemaText = new 
String(Files.readAllBytes(Paths.get(filePath)));
-        runner.setProperty(reader, SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY);
-        runner.setProperty(reader, SchemaAccessUtils.SCHEMA_TEXT, 
outputSchemaText);
+        for (Map.Entry<PropertyDescriptor, String> entry : 
xmlReaderProperties.entrySet()) {
+            runner.setProperty(reader, entry.getKey(), entry.getValue());
+        }
 
+        runner.enableControllerService(reader);
         return runner;
     }
 
     @Test
-    public void testRecordFormat() throws IOException, InitializationException 
{
-        TestRunner runner = setup("src/test/resources/xml/testschema");
+    public void testRecordFormatDeterminedBasedOnAttribute() throws 
IOException, InitializationException {
+        String outputSchemaPath = "src/test/resources/xml/testschema";
+        String outputSchemaText = new 
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
 
-        runner.setProperty(reader, XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_EVALUATE);
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, 
outputSchemaText);
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_EVALUATE.getValue());
+        TestRunner runner = setup(xmlReaderProperties);
 
-        runner.enableControllerService(reader);
-
-        InputStream is = new 
FileInputStream("src/test/resources/xml/people.xml");
-        runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, 
"true"));
-        runner.run();
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/people.xml")) {
+            runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, 
"true"));
+            runner.run();
+        }
 
         List<MockFlowFile> flowFile = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
         List<String> records = Arrays.asList((new 
String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n"));
@@ -76,16 +83,20 @@ public class TestXMLReader {
     }
 
     @Test
-    public void testRecordFormat2() throws IOException, 
InitializationException {
-        TestRunner runner = setup("src/test/resources/xml/testschema");
+    public void testRecordFormatArray() throws IOException, 
InitializationException {
+        String outputSchemaPath = "src/test/resources/xml/testschema";
+        String outputSchemaText = new 
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
 
-        runner.setProperty(reader, XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_ARRAY);
-
-        runner.enableControllerService(reader);
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, 
outputSchemaText);
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_ARRAY.getValue());
+        TestRunner runner = setup(xmlReaderProperties);
 
-        InputStream is = new 
FileInputStream("src/test/resources/xml/people.xml");
-        runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, 
"true"));
-        runner.run();
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/people.xml")) {
+            runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, 
"true"));
+            runner.run();
+        }
 
         List<MockFlowFile> flowFile = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
         List<String> records = Arrays.asList((new 
String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n"));
@@ -94,16 +105,20 @@ public class TestXMLReader {
     }
 
     @Test
-    public void testRecordFormat3() throws IOException, 
InitializationException {
-        TestRunner runner = setup("src/test/resources/xml/testschema");
+    public void testRecordFormatNotArray() throws IOException, 
InitializationException {
+        String outputSchemaPath = "src/test/resources/xml/testschema";
+        String outputSchemaText = new 
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
 
-        runner.setProperty(reader, XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_SINGLE);
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, 
outputSchemaText);
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_SINGLE.getValue());
+        TestRunner runner = setup(xmlReaderProperties);
 
-        runner.enableControllerService(reader);
-
-        InputStream is = new 
FileInputStream("src/test/resources/xml/person.xml");
-        runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, 
"true"));
-        runner.run();
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/person.xml")) {
+            runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY, 
"true"));
+            runner.run();
+        }
 
         List<MockFlowFile> flowFile = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
         List<String> records = Arrays.asList(new 
String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@@ -113,16 +128,20 @@ public class TestXMLReader {
 
     @Test
     public void testAttributePrefix() throws IOException, 
InitializationException {
-        TestRunner runner = setup("src/test/resources/xml/testschema");
+        String outputSchemaPath = "src/test/resources/xml/testschema";
+        String outputSchemaText = new 
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
 
-        runner.setProperty(reader, XMLReader.ATTRIBUTE_PREFIX, "${" + 
ATTRIBUTE_PREFIX + "}");
-        runner.setProperty(reader, XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_ARRAY);
-
-        runner.enableControllerService(reader);
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, 
outputSchemaText);
+        xmlReaderProperties.put(XMLReader.ATTRIBUTE_PREFIX, "${" + 
ATTRIBUTE_PREFIX + "}");
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_ARRAY.getValue());
+        TestRunner runner = setup(xmlReaderProperties);
 
-        InputStream is = new 
FileInputStream("src/test/resources/xml/people.xml");
-        runner.enqueue(is, Collections.singletonMap(ATTRIBUTE_PREFIX, 
"ATTR_"));
-        runner.run();
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/people.xml")) {
+            runner.enqueue(is, Collections.singletonMap(ATTRIBUTE_PREFIX, 
"ATTR_"));
+            runner.run();
+        }
 
         List<MockFlowFile> flowFile = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
         List<String> records = Arrays.asList(new 
String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@@ -136,16 +155,20 @@ public class TestXMLReader {
 
     @Test
     public void testContentField() throws IOException, InitializationException 
{
-        TestRunner runner = setup("src/test/resources/xml/testschema2");
+        String outputSchemaPath = "src/test/resources/xml/testschema2";
+        String outputSchemaText = new 
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
 
-        runner.setProperty(reader, XMLReader.CONTENT_FIELD_NAME, "${" + 
CONTENT_NAME + "}");
-        runner.setProperty(reader, XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_ARRAY);
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT, 
outputSchemaText);
+        xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, "${" + 
CONTENT_NAME + "}");
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_ARRAY.getValue());
+        TestRunner runner = setup(xmlReaderProperties);
 
-        runner.enableControllerService(reader);
-
-        InputStream is = new 
FileInputStream("src/test/resources/xml/people_tag_in_characters.xml");
-        runner.enqueue(is, Collections.singletonMap(CONTENT_NAME, "CONTENT"));
-        runner.run();
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/people_tag_in_characters.xml")) {
+            runner.enqueue(is, Collections.singletonMap(CONTENT_NAME, 
"CONTENT"));
+            runner.run();
+        }
 
         List<MockFlowFile> flowFile = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
         List<String> records = Arrays.asList(new 
String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@@ -157,4 +180,103 @@ public class TestXMLReader {
         assertEquals("MapRecord[{ID=P4, NAME=MapRecord[{CONTENT=Elenora Scrivens, ATTR=attr content, INNER=inner content}], AGE=16}]", records.get(3));
         assertEquals("MapRecord[{ID=P5, NAME=MapRecord[{INNER=inner content}]}]", records.get(4));
     }
+
+    @Test
+    public void testInferSchema() throws InitializationException, IOException {
+        String expectedContent = "MapRecord[{software=MapRecord[{" + CONTENT_NAME + "=Apache NiFi, favorite=true}], num=123, name=John Doe}]";
+
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_SINGLE.getValue());
+        xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, CONTENT_NAME);
+        TestRunner runner = setup(xmlReaderProperties);
+
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/person_record.xml")) {
+            runner.enqueue(is);
+            runner.run();
+        }
+
+        MockFlowFile out = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+        String actualContent = out.getContent();
+        assertEquals(expectedContent, actualContent);
+    }
+
+    @Test
+    public void testInferSchemaContentFieldNameNotSet() throws 
InitializationException, IOException {
+        String expectedContent = 
"MapRecord[{software=MapRecord[{favorite=true}], num=123, name=John Doe}]";
+
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_SINGLE.getValue());
+        TestRunner runner = setup(xmlReaderProperties);
+
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/person_record.xml")) {
+            runner.enqueue(is);
+            runner.run();
+        }
+
+        MockFlowFile out = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+        String actualContent = out.getContent();
+        assertEquals(expectedContent, actualContent);
+    }
+
+    @Test
+    public void testInferSchemaContentFieldNameNotSetSubElementExists() throws 
InitializationException, IOException {
+        String expectedContent = 
"MapRecord[{field_with_attribute=MapRecord[{attr=attr_content, value=123}]}]";
+
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_SINGLE.getValue());
+        TestRunner runner = setup(xmlReaderProperties);
+
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
+            runner.enqueue(is);
+            runner.run();
+        }
+
+        MockFlowFile out = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+        String actualContent = out.getContent();
+        assertEquals(expectedContent, actualContent);
+    }
+
+    @Test
+    public void testInferSchemaContentFieldNameSetSubElementExistsNameClash() 
throws InitializationException, IOException {
+        String expectedContent = "MapRecord[{field_with_attribute=MapRecord[{attr=attr_content, value=content of field}]}]";
+
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_SINGLE.getValue());
+        xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, "value");
+        TestRunner runner = setup(xmlReaderProperties);
+
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
+            runner.enqueue(is);
+            runner.run();
+        }
+
+        MockFlowFile out = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+        String actualContent = out.getContent();
+        assertEquals(expectedContent, actualContent);
+    }
+
+    @Test
+    public void testInferSchemaContentFieldNameSetSubElementExistsNoNameClash() throws InitializationException, IOException {
+        String expectedContent = "MapRecord[{field_with_attribute=MapRecord[{" +CONTENT_NAME + "=content of field, " +
+                "attr=attr_content, value=123}]}]";
+
+        Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+        xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+        xmlReaderProperties.put(XMLReader.RECORD_FORMAT, 
XMLReader.RECORD_SINGLE.getValue());
+        xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, CONTENT_NAME);
+        TestRunner runner = setup(xmlReaderProperties);
+
+        try (InputStream is = new 
FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
+            runner.enqueue(is);
+            runner.run();
+        }
+
+        MockFlowFile out = 
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+        String actualContent = out.getContent();
+        assertEquals(expectedContent, actualContent);
+    }
 }
diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml
new file mode 100644
index 0000000000..2c9146119d
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml
@@ -0,0 +1,4 @@
+<record>
+    <field_with_attribute attr="attr_content">content of field<value>123</value>
+    </field_with_attribute>
+</record>
\ No newline at end of file
diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml
new file mode 100644
index 0000000000..08b39093e0
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml
@@ -0,0 +1,5 @@
+<record>
+    <num>123</num>
+    <name>John Doe</name>
+    <software favorite="true">Apache NiFi</software>
+</record>
\ No newline at end of file

[nifi] 01/03: NIFI-9832: Fix disappearing XML element content when the element has attribute (#5896)

Reply via email to