This is an automated email from the ASF dual-hosted git repository.
markap14 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new 68c6722f76 NIFI-9832: Fix disappearing XML element content when the
element has attribute (#5896)
68c6722f76 is described below
commit 68c6722f76aedd24805ff4999331af5e97ccf9b2
Author: Peter Gyori <[email protected]>
AuthorDate: Tue Apr 12 17:41:50 2022 +0200
NIFI-9832: Fix disappearing XML element content when the element has
attribute (#5896)
- NIFI-9832: Additional test cases for XMLReader
---
.../nifi-record-serialization-services/pom.xml | 2 +
.../main/java/org/apache/nifi/xml/XMLReader.java | 12 +-
.../java/org/apache/nifi/xml/XMLRecordReader.java | 11 +-
.../apache/nifi/xml/inference/XmlRecordSource.java | 6 +-
.../additionalDetails.html | 351 +++++++++++++++++++++
.../org/apache/nifi/xml/TestInferXmlSchema.java | 11 +-
.../java/org/apache/nifi/xml/TestXMLReader.java | 218 ++++++++++---
.../test/resources/xml/field_with_sub-element.xml | 4 +
.../src/test/resources/xml/person_record.xml | 5 +
9 files changed, 561 insertions(+), 59 deletions(-)
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml
index e04159e001..e851565994 100755
---
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml
@@ -218,6 +218,7 @@
<exclude>src/test/resources/syslog/syslog5424/log_mix.txt</exclude>
<exclude>src/test/resources/syslog/syslog5424/log_mix_in_error.txt</exclude>
<exclude>src/test/resources/text/testschema</exclude>
+
<exclude>src/test/resources/xml/field_with_sub-element.xml</exclude>
<exclude>src/test/resources/xml/people.xml</exclude>
<exclude>src/test/resources/xml/people2.xml</exclude>
<exclude>src/test/resources/xml/people3.xml</exclude>
@@ -236,6 +237,7 @@
<exclude>src/test/resources/xml/people_tag_in_characters.xml</exclude>
<exclude>src/test/resources/xml/people_with_header_and_comments.xml</exclude>
<exclude>src/test/resources/xml/person.xml</exclude>
+
<exclude>src/test/resources/xml/person_record.xml</exclude>
<exclude>src/test/resources/xml/testschema</exclude>
<exclude>src/test/resources/xml/testschema2</exclude>
<exclude>src/test/resources/xml/testschema3</exclude>
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java
index 52a9701507..20ee5195c6 100644
---
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLReader.java
@@ -97,7 +97,10 @@ public class XMLReader extends SchemaRegistryService
implements RecordReaderFact
.description("If tags with content (e. g. <field>content</field>)
are defined as nested records in the schema, " +
"the name of the tag will be used as name for the record
and the value of this property will be used as name for the field. " +
"If tags with content shall be parsed together with
attributes (e. g. <field attribute=\"123\">content</field>), " +
- "they have to be defined as records. For additional
information, see the section of processor usage.")
+ "they have to be defined as records. In such a case, the
name of the tag will be used as the name for the record and " +
+ "the value of this property will be used as the name for
the field holding the original content. The name of the attribute " +
+ "will be used to create a new record field, the content of
which will be the value of the attribute. " +
+ "For more information, see the 'Additional Details...'
section of the XMLReader controller service's documentation.")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.required(false)
@@ -136,7 +139,12 @@ public class XMLReader extends SchemaRegistryService
implements RecordReaderFact
@Override
protected SchemaAccessStrategy getSchemaAccessStrategy(final String
strategy, final SchemaRegistry schemaRegistry, final PropertyContext context) {
- final RecordSourceFactory<XmlNode> sourceFactory = (variables,
contentStream) -> new XmlRecordSource(contentStream, isMultipleRecords(context,
variables));
+
+ final RecordSourceFactory<XmlNode> sourceFactory = (variables,
contentStream) -> {
+ String contentFieldName =
trim(context.getProperty(CONTENT_FIELD_NAME).evaluateAttributeExpressions(variables).getValue());
+ contentFieldName = (contentFieldName == null) ? "value" :
contentFieldName;
+ return new XmlRecordSource(contentStream, contentFieldName,
isMultipleRecords(context, variables));
+ };
final Supplier<SchemaInferenceEngine<XmlNode>> schemaInference = () ->
new XmlSchemaInference(new TimeValueInference(dateFormat, timeFormat,
timestampFormat));
return SchemaInferenceUtil.getSchemaAccessStrategy(strategy, context,
getLogger(), sourceFactory, schemaInference,
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java
index 2cb165b6e9..fd6d23b3e2 100644
---
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/XMLRecordReader.java
@@ -339,8 +339,8 @@ public class XMLRecordReader implements RecordReader {
if (contentFieldName != null) {
recordValues.put(contentFieldName, content.toString());
} else {
- logger.debug("Found content for field that has to be
parsed as record but property \"Field Name for Content\" is not set. " +
- "The content will not be added to the record.");
+ logger.debug("Found content for a field that was supposed
to be named with the value of the \"Field Name for Content\" property but " +
+ "the property was not set. The content was not
added to the record.");
}
return new MapRecord(new
SimpleRecordSchema(Collections.emptyList()), recordValues);
@@ -486,10 +486,13 @@ public class XMLRecordReader implements RecordReader {
if (field.isPresent()) {
Object value = parseStringForType(content.toString(),
contentFieldName, field.get().getDataType());
recordValues.put(contentFieldName, value);
+ } else {
+ logger.debug("Found content for a field that was supposed
to be named with the value of the \"Field Name for Content\" property " +
+ "but no such field was present in the schema. The
content was not added to the record.");
}
} else {
- logger.debug("Found content for field that is defined as
record but property \"Field Name for Content\" is not set. " +
- "The content will not be added to record.");
+ logger.debug("Found content for a field that was supposed to
be named with the value of the \"Field Name for Content\" property but " +
+ "the property was not set. The content was not added
to the record.");
}
}
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java
index 3192e141aa..8352aed08d 100644
---
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/xml/inference/XmlRecordSource.java
@@ -35,8 +35,10 @@ import java.util.Map;
public class XmlRecordSource implements RecordSource<XmlNode> {
private final XMLEventReader xmlEventReader;
+ private final String contentFieldName;
- public XmlRecordSource(final InputStream in, final boolean ignoreWrapper)
throws IOException {
+ public XmlRecordSource(final InputStream in, final String
contentFieldName, final boolean ignoreWrapper) throws IOException {
+ this.contentFieldName = contentFieldName;
try {
final XMLInputFactory xmlInputFactory =
XMLInputFactory.newInstance();
@@ -125,7 +127,7 @@ public class XmlRecordSource implements
RecordSource<XmlNode> {
} else {
final String textContent = content.toString().trim();
if (!textContent.equals("")) {
- childNodes.put("value", new XmlTextNode("value", textContent));
+ childNodes.put(contentFieldName, new
XmlTextNode(contentFieldName, textContent));
}
return new XmlContainerNode(nodeName, childNodes);
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html
index b8e1dff0d8..adcb3cc550 100755
---
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/resources/docs/org.apache.nifi.xml.XMLReader/additionalDetails.html
@@ -286,6 +286,357 @@
for tags containing attributes and content.
</p>
+ <h2>Example: Tags with Attributes and Schema Inference</h2>
+
+ <p>
+ When the record's schema is not provided but inferred based on the
data itself, providing a value for the "Field Name for Content" property
+ is especially important. (For detailed information on schema
inference, see the "Schema Inference" section below.)
+ Let's focus on cases where an XML element (called
<code><field_with_attribute></code> in the examples) has an XML attribute
and some content and no sub-elements.
+ For the examples below, let's assume that a ConvertRecord processor is
used, and it uses an XMLReader controller service and an XMLRecordSetWriter
+ controller service. The settings for XMLReader are provided separately
for each example. The settings for XMLRecordSetWriter are common
+ for all the examples below. This way an XML to XML conversion is
executed and comparing the input data with the output highlights
+ the schema inference behavior. The same behavior can be observed if a
different Writer controller service is used.
+ XMLRecordSetWriter was chosen for these examples so that the input and
the output are easily comparable.
+ The settings of the common XMLRecordSetWriter are the following:
+ </p>
+
+ <table>
+ <tr>
+ <th>Property Name</th>
+ <th>Property Value</th>
+ </tr>
+ <tr>
+ <td>Schema Access Strategy</td>
+ <td><code>Inherit Record Schema</code></td>
+ </tr>
+ <tr>
+ <td>Suppress Null Values</td>
+ <td><code>Never Suppress</code></td>
+ </tr>
+ </table>
+
+ <h3>XML Attributes and Schema Inference Example 1</h3>
+
+ <p>
+ XMLReader settings:
+ </p>
+
+ <table>
+ <tr>
+ <th>Property Name</th>
+ <th>Property Value</th>
+ </tr>
+ <tr>
+ <td>Schema Access Strategy</td>
+ <td><code>Infer Schema</code></td>
+ </tr>
+ <tr>
+ <td>Expect Records as Array</td>
+ <td><code>false</code></td>
+ </tr>
+ <tr>
+ <td>Field Name for Content</td>
+ <td>not set</td>
+ </tr>
+ </table>
+
+ <p>
+ Input:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute attr="attr_content">
+ content of field
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>As mentioned above, the element called "field_with_attribute" has an
attribute and some content but no sub-element.</p>
+
+ <p>
+ Output:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute>
+ <attr>attr_content</attr>
+ <value></value>
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ In the XMLReader's settings, no value is set for the "Field Name for
Content" property. In such cases the schema inference logic
+ adds a field named "value" to the schema. However, since "Field Name
for Content" is not set, the data processing logic is instructed
+ not to consider the original content of the parent XML tags
(<code><field_with_attribute></code>, the content of which is "content of
field"
+ in the example). So a new field named "value" appears in the schema
but no value is assigned to it from the data, thus the field is empty.
+ The XML attribute (named "attr") is processed, a field named "attr" is
added to the schema and the attribute's value ("attr_content") is assigned to
it.
+ In a case like this, the parent field's original content is lost and a
new field named "value" appears in the schema with no data assigned to it.
+ This is to make sure that no data is overwritten in the record if it
already contains a field named "value". More on that case in Example 3 and
Example 4.
+ </p>
+
+ <h3>XML Attributes and Schema Inference Example 2</h3>
+
+ <p>
+ In this example, the XMLReader's "Field Name for Content" property is
filled with the value "original_content". The input data is the same as
+ in the previous example.
+ </p>
+
+ <p>
+ XMLReader settings:
+ </p>
+
+ <table>
+ <tr>
+ <th>Property Name</th>
+ <th>Property Value</th>
+ </tr>
+ <tr>
+ <td>Schema Access Strategy</td>
+ <td><code>Infer Schema</code></td>
+ </tr>
+ <tr>
+ <td>Expect Records as Array</td>
+ <td><code>false</code></td>
+ </tr>
+ <tr>
+ <td>Field Name for Content</td>
+ <td><code>original_content</code></td>
+ </tr>
+ </table>
+
+ <p>
+ Input:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute attr="attr_content">
+ content of field
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ Output:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute>
+ <attr>attr_content</attr>
+ <original_content>content of
field</original_content>
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ The XMLReader's "Field Name for Content" property contains the value
"original_content" (the concrete value is not important; what is important
+ is that a value is provided and it does not clash with the name of any
sub-element in <code><field_with_attribute></code>).
+ This explicitly tells the XMLReader controller service to create a
field named "original_content" and make the original content of
+ the parent XML tag the value of the field named "original_content".
Adding the XML attribute named "attr" works just like in the first example.
+ Since the <code><field_with_attribute></code> element had no
child-element with the name "original_content", no data is lost.
+ </p>
+
+ <h3>XML Attributes and Schema Inference Example 3</h3>
+
+ <p>
+ In this example, XMLReader's "Field Name for Content" property is left
empty. In the input data, the <code><field_with_attribute></code> element
+ has some content and a sub-element named <code><value></code>.
+ </p>
+
+ <p>
+ XMLReader settings:
+ </p>
+
+ <table>
+ <tr>
+ <th>Property Name</th>
+ <th>Property Value</th>
+ </tr>
+ <tr>
+ <td>Schema Access Strategy</td>
+ <td><code>Infer Schema</code></td>
+ </tr>
+ <tr>
+ <td>Expect Records as Array</td>
+ <td><code>false</code></td>
+ </tr>
+ <tr>
+ <td>Field Name for Content</td>
+ <td>not set</td>
+ </tr>
+ </table>
+
+ <p>
+ Input:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute attr="attr_content">
+ content of field<value>123</value>
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ Output:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute>
+ <attr>attr_content</attr>
+ <value>123</value>
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ The "Field Name for Content" property is not set, and the XML element
has a sub-element named "value". The name of the sub-element clashes with the
+ default field name added to the schema by the Schema Inference logic
(see Example 1). As seen in the output data, the input XML attribute's value
+ is added to the record just like in the previous examples. The value
of the <code><value></code> element is retained, but the content of the
+ <code><field_with_attribute></code> element that was outside of the
sub-element is lost.
+ </p>
+
+ <h3>XML Attributes and Schema Inference Example 4</h3>
+
+ <p>
+ In this example, XMLReader's "Field Name for Content" property is
given the value "value". In the input data, the
<code><field_with_attribute></code> element
+ has some content and a sub-element named <code><value></code>.
The name of the sub-element clashes with the value of the "Field Name for
Content" property.
+ </p>
+
+ <p>
+ XMLReader settings:
+ </p>
+
+ <table>
+ <tr>
+ <th>Property Name</th>
+ <th>Property Value</th>
+ </tr>
+ <tr>
+ <td>Schema Access Strategy</td>
+ <td><code>Infer Schema</code></td>
+ </tr>
+ <tr>
+ <td>Expect Records as Array</td>
+ <td><code>false</code></td>
+ </tr>
+ <tr>
+ <td>Field Name for Content</td>
+ <td><code>value</code></td>
+ </tr>
+ </table>
+
+ <p>
+ Input:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute attr="attr_content">
+ content of field<value>123</value>
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ Output:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute>
+ <attr>attr_content</attr>
+ <value>content of field</value>
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ The "Field Name for Content" property's value is "value", and the XML
element has a sub-element named "value". The name of the sub-element clashes
with the
+ value of the "Field Name for Content" property. The value of the
<code><value></code> element is replaced by the content of the
+ <code><field_with_attribute></code> element, and the original
content of the <code><value></code> element is lost.
+ </p>
+
+ <h3>XML Attributes and Schema Inference Example 5</h3>
+
+ <p>
+ To avoid losing any data, the XMLReader's "Field Name for Content"
property needs to be given a value that does not clash with any sub-element's
name
+ in the input data. In this example the input data is the same as in
the previous one, but the "Field Name for Content" property's value is
"original_content",
+ a value that does not clash with any sub-element name. No data is lost
in this case.
+ </p>
+
+ <p>
+ XMLReader settings:
+ </p>
+
+ <table>
+ <tr>
+ <th>Property Name</th>
+ <th>Property Value</th>
+ </tr>
+ <tr>
+ <td>Schema Access Strategy</td>
+ <td><code>Infer Schema</code></td>
+ </tr>
+ <tr>
+ <td>Expect Records as Array</td>
+ <td><code>false</code></td>
+ </tr>
+ <tr>
+ <td>Field Name for Content</td>
+ <td><code>original_content</code></td>
+ </tr>
+ </table>
+
+ <p>
+ Input:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute attr="attr_content">
+ content of field<value>123</value>
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ Output:
+ </p>
+
+ <code>
+ <pre>
+ <record>
+ <field_with_attribute>
+ <attr>attr_content</attr>
+ <value>123</value>
+ <original_content>content of
field</original_content>
+ </field_with_attribute>
+ </record></pre>
+ </code>
+
+ <p>
+ It can be seen in the output data that the attribute has been added
to the <code><field_with_attribute></code> element as a sub-element,
+ the <code><value></code> element retained its value, and the original
content of the <code><field_with_attribute></code> element has been added
as a sub-element
+ named "original_content". This is because a value was chosen for the
"Field Name for Content" property that does not clash with any of
+ the existing sub-elements of the input XML element
(<code><field_with_attribute></code>). No data is lost.
+ </p>
+
<h2>Example: Array of records</h2>
<p>
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java
index b5bdd76025..2a4cd14f10 100644
---
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestInferXmlSchema.java
@@ -93,7 +93,8 @@ public class TestInferXmlSchema {
@Test
public void testStringFieldWithAttributes() throws IOException {
- final RecordSchema schema =
inferSchema("src/test/resources/xml/TextNodeWithAttribute.xml", true);
+ final String contentFieldName = "contentfield";
+ final RecordSchema schema =
inferSchema("src/test/resources/xml/TextNodeWithAttribute.xml",
contentFieldName, true);
assertEquals(3, schema.getFieldCount());
@@ -106,12 +107,16 @@ public class TestInferXmlSchema {
final RecordSchema childSchema = ((RecordDataType)
softwareDataType).getChildSchema();
assertSame(RecordFieldType.BOOLEAN,
childSchema.getDataType("favorite").get().getFieldType());
- assertSame(RecordFieldType.STRING,
childSchema.getDataType("value").get().getFieldType());
+ assertSame(RecordFieldType.STRING,
childSchema.getDataType(contentFieldName).get().getFieldType());
}
private RecordSchema inferSchema(final String filename, final boolean
ignoreWrapper) throws IOException {
+ return inferSchema(filename, "contentfield", ignoreWrapper);
+ }
+
+ private RecordSchema inferSchema(final String filename, final String
contentFieldName, final boolean ignoreWrapper) throws IOException {
final File file = new File(filename);
- final RecordSourceFactory<XmlNode> xmlSourceFactory = (var, in) ->
new XmlRecordSource(in, ignoreWrapper);
+ final RecordSourceFactory<XmlNode> xmlSourceFactory = (var, in) ->
new XmlRecordSource(in, contentFieldName, ignoreWrapper);
final SchemaInferenceEngine<XmlNode> schemaInference = new
XmlSchemaInference(timeValueInference);
final InferSchemaAccessStrategy<XmlNode> inferStrategy = new
InferSchemaAccessStrategy<>(xmlSourceFactory, schemaInference,
Mockito.mock(ComponentLog.class));
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java
index 5de7eac402..6904a6f599 100644
---
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/xml/TestXMLReader.java
@@ -17,8 +17,10 @@
package org.apache.nifi.xml;
+import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.schema.access.SchemaAccessUtils;
+import org.apache.nifi.schema.inference.SchemaInferenceUtil;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
@@ -31,43 +33,48 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import static junit.framework.TestCase.assertEquals;
public class TestXMLReader {
- private XMLReader reader;
-
private final String ATTRIBUTE_PREFIX = "attribute_prefix";
private final String CONTENT_NAME = "content_field";
private final String EVALUATE_IS_ARRAY = "xml.stream.is.array";
- public TestRunner setup(String filePath) throws InitializationException,
IOException {
-
+ private TestRunner setup(Map<PropertyDescriptor, String>
xmlReaderProperties) throws InitializationException {
TestRunner runner =
TestRunners.newTestRunner(TestXMLReaderProcessor.class);
- reader = new XMLReader();
+ XMLReader reader = new XMLReader();
+
runner.addControllerService("xml_reader", reader);
runner.setProperty(TestXMLReaderProcessor.XML_READER, "xml_reader");
- final String outputSchemaText = new
String(Files.readAllBytes(Paths.get(filePath)));
- runner.setProperty(reader, SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY);
- runner.setProperty(reader, SchemaAccessUtils.SCHEMA_TEXT,
outputSchemaText);
+ for (Map.Entry<PropertyDescriptor, String> entry :
xmlReaderProperties.entrySet()) {
+ runner.setProperty(reader, entry.getKey(), entry.getValue());
+ }
+ runner.enableControllerService(reader);
return runner;
}
@Test
- public void testRecordFormat() throws IOException, InitializationException
{
- TestRunner runner = setup("src/test/resources/xml/testschema");
+ public void testRecordFormatDeterminedBasedOnAttribute() throws
IOException, InitializationException {
+ String outputSchemaPath = "src/test/resources/xml/testschema";
+ String outputSchemaText = new
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
- runner.setProperty(reader, XMLReader.RECORD_FORMAT,
XMLReader.RECORD_EVALUATE);
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT,
outputSchemaText);
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_EVALUATE.getValue());
+ TestRunner runner = setup(xmlReaderProperties);
- runner.enableControllerService(reader);
-
- InputStream is = new
FileInputStream("src/test/resources/xml/people.xml");
- runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY,
"true"));
- runner.run();
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/people.xml")) {
+ runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY,
"true"));
+ runner.run();
+ }
List<MockFlowFile> flowFile =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList((new
String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n"));
@@ -76,16 +83,20 @@ public class TestXMLReader {
}
@Test
- public void testRecordFormat2() throws IOException,
InitializationException {
- TestRunner runner = setup("src/test/resources/xml/testschema");
+ public void testRecordFormatArray() throws IOException,
InitializationException {
+ String outputSchemaPath = "src/test/resources/xml/testschema";
+ String outputSchemaText = new
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
- runner.setProperty(reader, XMLReader.RECORD_FORMAT,
XMLReader.RECORD_ARRAY);
-
- runner.enableControllerService(reader);
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT,
outputSchemaText);
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_ARRAY.getValue());
+ TestRunner runner = setup(xmlReaderProperties);
- InputStream is = new
FileInputStream("src/test/resources/xml/people.xml");
- runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY,
"true"));
- runner.run();
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/people.xml")) {
+ runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY,
"true"));
+ runner.run();
+ }
List<MockFlowFile> flowFile =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList((new
String(runner.getContentAsByteArray(flowFile.get(0)))).split("\n"));
@@ -94,16 +105,20 @@ public class TestXMLReader {
}
@Test
- public void testRecordFormat3() throws IOException,
InitializationException {
- TestRunner runner = setup("src/test/resources/xml/testschema");
+ public void testRecordFormatNotArray() throws IOException,
InitializationException {
+ String outputSchemaPath = "src/test/resources/xml/testschema";
+ String outputSchemaText = new
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
- runner.setProperty(reader, XMLReader.RECORD_FORMAT,
XMLReader.RECORD_SINGLE);
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT,
outputSchemaText);
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_SINGLE.getValue());
+ TestRunner runner = setup(xmlReaderProperties);
- runner.enableControllerService(reader);
-
- InputStream is = new
FileInputStream("src/test/resources/xml/person.xml");
- runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY,
"true"));
- runner.run();
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/person.xml")) {
+ runner.enqueue(is, Collections.singletonMap(EVALUATE_IS_ARRAY,
"true"));
+ runner.run();
+ }
List<MockFlowFile> flowFile =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList(new
String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@@ -113,16 +128,20 @@ public class TestXMLReader {
@Test
public void testAttributePrefix() throws IOException,
InitializationException {
- TestRunner runner = setup("src/test/resources/xml/testschema");
+ String outputSchemaPath = "src/test/resources/xml/testschema";
+ String outputSchemaText = new
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
- runner.setProperty(reader, XMLReader.ATTRIBUTE_PREFIX, "${" +
ATTRIBUTE_PREFIX + "}");
- runner.setProperty(reader, XMLReader.RECORD_FORMAT,
XMLReader.RECORD_ARRAY);
-
- runner.enableControllerService(reader);
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT,
outputSchemaText);
+ xmlReaderProperties.put(XMLReader.ATTRIBUTE_PREFIX, "${" +
ATTRIBUTE_PREFIX + "}");
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_ARRAY.getValue());
+ TestRunner runner = setup(xmlReaderProperties);
- InputStream is = new
FileInputStream("src/test/resources/xml/people.xml");
- runner.enqueue(is, Collections.singletonMap(ATTRIBUTE_PREFIX,
"ATTR_"));
- runner.run();
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/people.xml")) {
+ runner.enqueue(is, Collections.singletonMap(ATTRIBUTE_PREFIX,
"ATTR_"));
+ runner.run();
+ }
List<MockFlowFile> flowFile =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList(new
String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@@ -136,16 +155,20 @@ public class TestXMLReader {
@Test
public void testContentField() throws IOException, InitializationException
{
- TestRunner runner = setup("src/test/resources/xml/testschema2");
+ String outputSchemaPath = "src/test/resources/xml/testschema2";
+ String outputSchemaText = new
String(Files.readAllBytes(Paths.get(outputSchemaPath)));
- runner.setProperty(reader, XMLReader.CONTENT_FIELD_NAME, "${" +
CONTENT_NAME + "}");
- runner.setProperty(reader, XMLReader.RECORD_FORMAT,
XMLReader.RECORD_ARRAY);
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaAccessUtils.SCHEMA_TEXT_PROPERTY.getValue());
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_TEXT,
outputSchemaText);
+ xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, "${" +
CONTENT_NAME + "}");
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_ARRAY.getValue());
+ TestRunner runner = setup(xmlReaderProperties);
- runner.enableControllerService(reader);
-
- InputStream is = new
FileInputStream("src/test/resources/xml/people_tag_in_characters.xml");
- runner.enqueue(is, Collections.singletonMap(CONTENT_NAME, "CONTENT"));
- runner.run();
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/people_tag_in_characters.xml")) {
+ runner.enqueue(is, Collections.singletonMap(CONTENT_NAME,
"CONTENT"));
+ runner.run();
+ }
List<MockFlowFile> flowFile =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS);
List<String> records = Arrays.asList(new
String(runner.getContentAsByteArray(flowFile.get(0))).split("\n"));
@@ -157,4 +180,103 @@ public class TestXMLReader {
assertEquals("MapRecord[{ID=P4, NAME=MapRecord[{CONTENT=Elenora Scrivens, ATTR=attr content, INNER=inner content}], AGE=16}]", records.get(3));
assertEquals("MapRecord[{ID=P5, NAME=MapRecord[{INNER=inner content}]}]", records.get(4));
}
+
+ @Test
+ public void testInferSchema() throws InitializationException, IOException {
+ String expectedContent = "MapRecord[{software=MapRecord[{" +
CONTENT_NAME + "=Apache NiFi, favorite=true}], num=123, name=John Doe}]";
+
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_SINGLE.getValue());
+ xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, CONTENT_NAME);
+ TestRunner runner = setup(xmlReaderProperties);
+
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/person_record.xml")) {
+ runner.enqueue(is);
+ runner.run();
+ }
+
+ MockFlowFile out =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+ String actualContent = out.getContent();
+ assertEquals(expectedContent, actualContent);
+ }
+
+ @Test
+ public void testInferSchemaContentFieldNameNotSet() throws
InitializationException, IOException {
+ String expectedContent =
"MapRecord[{software=MapRecord[{favorite=true}], num=123, name=John Doe}]";
+
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_SINGLE.getValue());
+ TestRunner runner = setup(xmlReaderProperties);
+
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/person_record.xml")) {
+ runner.enqueue(is);
+ runner.run();
+ }
+
+ MockFlowFile out =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+ String actualContent = out.getContent();
+ assertEquals(expectedContent, actualContent);
+ }
+
+ @Test
+ public void testInferSchemaContentFieldNameNotSetSubElementExists() throws
InitializationException, IOException {
+ String expectedContent =
"MapRecord[{field_with_attribute=MapRecord[{attr=attr_content, value=123}]}]";
+
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_SINGLE.getValue());
+ TestRunner runner = setup(xmlReaderProperties);
+
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
+ runner.enqueue(is);
+ runner.run();
+ }
+
+ MockFlowFile out =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+ String actualContent = out.getContent();
+ assertEquals(expectedContent, actualContent);
+ }
+
+ @Test
+ public void testInferSchemaContentFieldNameSetSubElementExistsNameClash()
throws InitializationException, IOException {
+ String expectedContent = "MapRecord[{field_with_attribute=MapRecord[{attr=attr_content, value=content of field}]}]";
+
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_SINGLE.getValue());
+ xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, "value");
+ TestRunner runner = setup(xmlReaderProperties);
+
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
+ runner.enqueue(is);
+ runner.run();
+ }
+
+ MockFlowFile out =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+ String actualContent = out.getContent();
+ assertEquals(expectedContent, actualContent);
+ }
+
+ @Test
+ public void testInferSchemaContentFieldNameSetSubElementExistsNoNameClash() throws InitializationException, IOException {
+ String expectedContent = "MapRecord[{field_with_attribute=MapRecord[{" + CONTENT_NAME + "=content of field, " +
+ "attr=attr_content, value=123}]}]";
+
+ Map<PropertyDescriptor, String> xmlReaderProperties = new HashMap<>();
+ xmlReaderProperties.put(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
SchemaInferenceUtil.INFER_SCHEMA.getValue());
+ xmlReaderProperties.put(XMLReader.RECORD_FORMAT,
XMLReader.RECORD_SINGLE.getValue());
+ xmlReaderProperties.put(XMLReader.CONTENT_FIELD_NAME, CONTENT_NAME);
+ TestRunner runner = setup(xmlReaderProperties);
+
+ try (InputStream is = new
FileInputStream("src/test/resources/xml/field_with_sub-element.xml")) {
+ runner.enqueue(is);
+ runner.run();
+ }
+
+ MockFlowFile out =
runner.getFlowFilesForRelationship(TestXMLReaderProcessor.SUCCESS).get(0);
+ String actualContent = out.getContent();
+ assertEquals(expectedContent, actualContent);
+ }
}
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml
new file mode 100644
index 0000000000..2c9146119d
--- /dev/null
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/field_with_sub-element.xml
@@ -0,0 +1,4 @@
+<record>
+ <field_with_attribute attr="attr_content">content of field<value>123</value>
+ </field_with_attribute>
+</record>
\ No newline at end of file
diff --git
a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml
new file mode 100644
index 0000000000..08b39093e0
--- /dev/null
+++
b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/xml/person_record.xml
@@ -0,0 +1,5 @@
+<record>
+ <num>123</num>
+ <name>John Doe</name>
+ <software favorite="true">Apache NiFi</software>
+</record>
\ No newline at end of file