Repository: nifi
Updated Branches:
  refs/heads/master 2bc7d5262 -> 41ad03215


NIFI-3055 StandardRecordWriter Can Throw UTFDataFormatException (1.x)
* Remove function based on JDK source.
* Add new function to find bytes based on RFC3629.
* Add field name to log entry when field is truncated.

Signed-off-by: Mike Moser <[email protected]>
This closes #1475


Project: http://git-wip-us.apache.org/repos/asf/nifi/repo
Commit: http://git-wip-us.apache.org/repos/asf/nifi/commit/41ad0321
Tree: http://git-wip-us.apache.org/repos/asf/nifi/tree/41ad0321
Diff: http://git-wip-us.apache.org/repos/asf/nifi/diff/41ad0321

Branch: refs/heads/master
Commit: 41ad032151ffaf0b56b18400be61e12e2742a58d
Parents: 2bc7d52
Author: Joe Skora <[email protected]>
Authored: Mon Feb 6 18:55:01 2017 +0000
Committer: Mike Moser <[email protected]>
Committed: Mon Feb 13 20:15:59 2017 +0000

----------------------------------------------------------------------
 .../repository/schema/SchemaRecordWriter.java   | 47 +++++------
 .../schema/TestSchemaRecordReaderWriter.java    | 84 ++++++++++----------
 .../nifi/provenance/StandardRecordWriter.java   | 82 +++++++++----------
 3 files changed, 108 insertions(+), 105 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nifi/blob/41ad0321/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java
----------------------------------------------------------------------
diff --git a/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java b/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java
index 81043bc..3e4a059 100644
--- a/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java
+++ b/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java
@@ -113,7 +113,7 @@ public class SchemaRecordWriter {
                 out.writeLong((Long) value);
                 break;
             case STRING:
-                writeUTFLimited(out, (String) value);
+                writeUTFLimited(out, (String) value, field.getFieldName());
                 break;
             case LONG_STRING:
                 final byte[] charArray = ((String) 
value).getBytes(StandardCharsets.UTF_8);
@@ -134,7 +134,7 @@ public class SchemaRecordWriter {
                 break;
             case UNION:
                 final NamedValue namedValue = (NamedValue) value;
-                writeUTFLimited(out, namedValue.getName());
+                writeUTFLimited(out, namedValue.getName(), 
field.getFieldName());
                 final Record childRecord = (Record) namedValue.getValue();
                 writeRecordFields(childRecord, out);
                 break;
@@ -145,14 +145,14 @@ public class SchemaRecordWriter {
         }
     }
 
-    private void writeUTFLimited(final DataOutputStream out, final String 
utfString) throws IOException {
+    private void writeUTFLimited(final DataOutputStream out, final String 
utfString, final String fieldName) throws IOException {
         try {
             out.writeUTF(utfString);
         } catch (UTFDataFormatException e) {
-            final String truncated = utfString.substring(0, 
getCharsInUTFLength(utfString, MAX_ALLOWED_UTF_LENGTH));
-            logger.warn("Truncating repository record value!  Attempted to 
write {} chars that encode to a UTF byte length greater than "
+            final String truncated = utfString.substring(0, 
getCharsInUTF8Limit(utfString, MAX_ALLOWED_UTF_LENGTH));
+            logger.warn("Truncating repository record value for field '{}'!  
Attempted to write {} chars that encode to a UTF8 byte length greater than "
                             + "supported maximum ({}), truncating to {} 
chars.",
-                    utfString.length(), MAX_ALLOWED_UTF_LENGTH, 
truncated.length());
+                    (fieldName == null) ? "" : fieldName, utfString.length(), 
MAX_ALLOWED_UTF_LENGTH, truncated.length());
             if (logger.isDebugEnabled()) {
                 logger.warn("String value was:\n{}", truncated);
             }
@@ -160,28 +160,29 @@ public class SchemaRecordWriter {
         }
     }
 
-
-    static int getCharsInUTFLength(final String str, final int utfLimit) {
-        // see java.io.DataOutputStream.writeUTF()
-        int strlen = str.length();
-        int utflen = 0;
-        int c;
-
-        /* use charAt instead of copying String to Char array */
-        for (int i = 0; i < strlen; i++) {
-            c = str.charAt(i);
-            if ((c >= 0x0001) & (c <= 0x007F)) {
-                utflen++;
-            } else if (c > 0x07FF) {
-                utflen += 3;
+    static int getCharsInUTF8Limit(final String str, final int utf8Limit) {
+        // Calculate how much of String fits within UTF8 byte limit based on 
RFC3629.
+        //
+        // Java String values use char[] for storage, so character values 
>0xFFFF that
+        // map to 4 byte UTF8 representations are not considered.
+
+        final int charsInOriginal = str.length();
+        int bytesInUTF8 = 0;
+
+        for (int i = 0; i < charsInOriginal; i++) {
+            final int curr = str.charAt(i);
+            if (curr < 0x0080) {
+                bytesInUTF8++;
+            } else if (curr < 0x0800) {
+                bytesInUTF8 += 2;
             } else {
-                utflen += 2;
+                bytesInUTF8 += 3;
             }
-            if (utflen > utfLimit) {
+            if (bytesInUTF8 > utf8Limit) {
                 return i;
             }
         }
-        return strlen;
+        return charsInOriginal;
     }
 
 }

http://git-wip-us.apache.org/repos/asf/nifi/blob/41ad0321/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java
----------------------------------------------------------------------
diff --git a/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java b/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java
index 5eb815a..5dfd40e 100644
--- a/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java
+++ b/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java
@@ -235,52 +235,52 @@ public class TestSchemaRecordReaderWriter {
     }
 
     @Test
-    public void testSingleCharUTFLengths() {
-        // verify handling of single characters mapping to 1, 2, and 3 utf 
byte strings
-        assertEquals("test 1 char string truncated to 0 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 0));
-        assertEquals("test 2 char string truncated to 0 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 0));
-        assertEquals("test 3 char string truncated to 0 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 0));
-        assertEquals("test 1 char string truncated to 1 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 1));
-        assertEquals("test 2 char string truncated to 1 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 1));
-        assertEquals("test 3 char string truncated to 1 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 1));
-        assertEquals("test 1 char string truncated to 2 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 2));
-        assertEquals("test 2 char string truncated to 2 utf bytes should be 
2", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 2));
-        assertEquals("test 3 char string truncated to 2 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 2));
-        assertEquals("test 1 char string truncated to 3 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 3));
-        assertEquals("test 2 char string truncated to 3 utf bytes should be 
2", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 3));
-        assertEquals("test 3 char string truncated to 3 utf bytes should be 
3", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 3));
+    public void testSingleCharUTF8Lengths() {
+        // verify handling of single characters mapping to utf8 byte strings
+        assertEquals("test 1 char string truncated to 0 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 0));
+        assertEquals("test 2 char string truncated to 0 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 0));
+        assertEquals("test 3 char string truncated to 0 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 0));
+        assertEquals("test 1 char string truncated to 1 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 1));
+        assertEquals("test 2 char string truncated to 1 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 1));
+        assertEquals("test 3 char string truncated to 1 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 1));
+        assertEquals("test 1 char string truncated to 2 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 2));
+        assertEquals("test 2 char string truncated to 2 utf bytes should be 
2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 2));
+        assertEquals("test 3 char string truncated to 2 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 2));
+        assertEquals("test 1 char string truncated to 3 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 3));
+        assertEquals("test 2 char string truncated to 3 utf bytes should be 
2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 3));
+        assertEquals("test 3 char string truncated to 3 utf bytes should be 
3", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 3));
     }
 
     @Test
     public void testMultiCharUTFLengths() {
         // test boundary conditions as 1, 2, and 3 UTF byte chars are included 
into utf limit                                                  positions used 
by strings
         final String testString1 = utfStringOneByte + utfStringTwoByte + 
utfStringThreeByte;                                                // char 
'abc' utf 'abbccc'
-        assertEquals("test 6 char string truncated to 0 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTFLength(testString1, 0)); //            
utf ''
-        assertEquals("test 6 char string truncated to 1 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTFLength(testString1, 1)); //            
utf 'a'
-        assertEquals("test 6 char string truncated to 2 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTFLength(testString1, 2)); //            
utf 'a'
-        assertEquals("test 6 char string truncated to 3 utf bytes should be 
2", 2, SchemaRecordWriter.getCharsInUTFLength(testString1, 3)); //            
utf 'abb'
-        assertEquals("test 6 char string truncated to 4 utf bytes should be 
2", 2, SchemaRecordWriter.getCharsInUTFLength(testString1, 4)); //            
utf 'abb'
-        assertEquals("test 6 char string truncated to 5 utf bytes should be 
2", 2, SchemaRecordWriter.getCharsInUTFLength(testString1, 5)); //            
utf 'abb'
-        assertEquals("test 6 char string truncated to 6 utf bytes should be 
3", 3, SchemaRecordWriter.getCharsInUTFLength(testString1, 6)); //            
utf 'abbccc'
+        assertEquals("test 6 char string truncated to 0 utf bytes should be 
0", 0, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 0)); //            
utf ''
+        assertEquals("test 6 char string truncated to 1 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 1)); //            
utf 'a'
+        assertEquals("test 6 char string truncated to 2 utf bytes should be 
1", 1, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 2)); //            
utf 'a'
+        assertEquals("test 6 char string truncated to 3 utf bytes should be 
2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 3)); //            
utf 'abb'
+        assertEquals("test 6 char string truncated to 4 utf bytes should be 
2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 4)); //            
utf 'abb'
+        assertEquals("test 6 char string truncated to 5 utf bytes should be 
2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 5)); //            
utf 'abb'
+        assertEquals("test 6 char string truncated to 6 utf bytes should be 
3", 3, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 6)); //            
utf 'abbccc'
     }
 
     @Test
     public void testSmallCharUTFLengths() throws UnsupportedEncodingException {
         final String string12b = StringUtils.repeat(utfStringOneByte + 
utfStringTwoByte + utfStringThreeByte, 2);
 
-        assertEquals("test multi-char string truncated to  0 utf bytes should 
be 0", 0, SchemaRecordWriter.getCharsInUTFLength(string12b,  0));
-        assertEquals("test multi-char string truncated to  1 utf bytes should 
be 0", 1, SchemaRecordWriter.getCharsInUTFLength(string12b,  1));
-        assertEquals("test multi-char string truncated to  2 utf bytes should 
be 0", 1, SchemaRecordWriter.getCharsInUTFLength(string12b,  2));
-        assertEquals("test multi-char string truncated to  3 utf bytes should 
be 0", 2, SchemaRecordWriter.getCharsInUTFLength(string12b,  3));
-        assertEquals("test multi-char string truncated to  4 utf bytes should 
be 0", 2, SchemaRecordWriter.getCharsInUTFLength(string12b,  4));
-        assertEquals("test multi-char string truncated to  5 utf bytes should 
be 0", 2, SchemaRecordWriter.getCharsInUTFLength(string12b,  5));
-        assertEquals("test multi-char string truncated to  6 utf bytes should 
be 0", 3, SchemaRecordWriter.getCharsInUTFLength(string12b,  6));
-        assertEquals("test multi-char string truncated to  7 utf bytes should 
be 0", 4, SchemaRecordWriter.getCharsInUTFLength(string12b,  7));
-        assertEquals("test multi-char string truncated to  8 utf bytes should 
be 0", 4, SchemaRecordWriter.getCharsInUTFLength(string12b,  8));
-        assertEquals("test multi-char string truncated to  9 utf bytes should 
be 0", 5, SchemaRecordWriter.getCharsInUTFLength(string12b,  9));
-        assertEquals("test multi-char string truncated to 10 utf bytes should 
be 0", 5, SchemaRecordWriter.getCharsInUTFLength(string12b, 10));
-        assertEquals("test multi-char string truncated to 11 utf bytes should 
be 0", 5, SchemaRecordWriter.getCharsInUTFLength(string12b, 11));
-        assertEquals("test multi-char string truncated to 12 utf bytes should 
be 0", 6, SchemaRecordWriter.getCharsInUTFLength(string12b, 12));
+        assertEquals("test multi-char string truncated to  0 utf bytes should 
be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  0));
+        assertEquals("test multi-char string truncated to  1 utf bytes should 
be 0", 1, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  1));
+        assertEquals("test multi-char string truncated to  2 utf bytes should 
be 0", 1, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  2));
+        assertEquals("test multi-char string truncated to  3 utf bytes should 
be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  3));
+        assertEquals("test multi-char string truncated to  4 utf bytes should 
be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  4));
+        assertEquals("test multi-char string truncated to  5 utf bytes should 
be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  5));
+        assertEquals("test multi-char string truncated to  6 utf bytes should 
be 0", 3, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  6));
+        assertEquals("test multi-char string truncated to  7 utf bytes should 
be 0", 4, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  7));
+        assertEquals("test multi-char string truncated to  8 utf bytes should 
be 0", 4, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  8));
+        assertEquals("test multi-char string truncated to  9 utf bytes should 
be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  9));
+        assertEquals("test multi-char string truncated to 10 utf bytes should 
be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 10));
+        assertEquals("test multi-char string truncated to 11 utf bytes should 
be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 11));
+        assertEquals("test multi-char string truncated to 12 utf bytes should 
be 0", 6, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 12));
     }
 
     @Test
@@ -290,16 +290,16 @@ public class TestSchemaRecordReaderWriter {
         assertEquals("test 64k char string should be 64k chars long", 65535, 
string64k.length());
 
         // drop half the chars going to utf of 64k bytes -- (1+1+1) * 21845 = 
65535 chars which converts to (1+2+3) * 21845 = 131070 utf bytes so 1/2 is 
truncated
-        assertEquals("test 64k char string truncated to 65,535 utf bytes 
should be 32768", 32768, SchemaRecordWriter.getCharsInUTFLength(string64k, 
65535));
+        assertEquals("test 64k char string truncated to 65,535 utf bytes 
should be 32768", 32768, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 
65535));
 
         // dropping bytes off the end of utf length
-        assertEquals("test 64k char string truncated to 65,534 utf bytes 
should be 32767", 32767, SchemaRecordWriter.getCharsInUTFLength(string64k, 
65534)); // lost 2 byte char
-        assertEquals("test 64k char string truncated to 65,533 utf bytes 
should be 32767", 32767, SchemaRecordWriter.getCharsInUTFLength(string64k, 
65533));
-        assertEquals("test 64k char string truncated to 65,532 utf bytes 
should be 32766", 32766, SchemaRecordWriter.getCharsInUTFLength(string64k, 
65532)); // lost 1 byte char
-        assertEquals("test 64k char string truncated to 65,531 utf bytes 
should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 
65531)); // lost 3 byte char
-        assertEquals("test 64k char string truncated to 65,530 utf bytes 
should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 
65530));
-        assertEquals("test 64k char string truncated to 65,529 utf bytes 
should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 
65529));
-        assertEquals("test 64k char string truncated to 65,528 utf bytes 
should be 32764", 32764, SchemaRecordWriter.getCharsInUTFLength(string64k, 
65528)); // lost 2 byte char (again)
+        assertEquals("test 64k char string truncated to 65,534 utf bytes 
should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 
65534)); // lost 2 byte char
+        assertEquals("test 64k char string truncated to 65,533 utf bytes 
should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 
65533));
+        assertEquals("test 64k char string truncated to 65,532 utf bytes 
should be 32766", 32766, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 
65532)); // lost 1 byte char
+        assertEquals("test 64k char string truncated to 65,531 utf bytes 
should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 
65531)); // lost 3 byte char
+        assertEquals("test 64k char string truncated to 65,530 utf bytes 
should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 
65530));
+        assertEquals("test 64k char string truncated to 65,529 utf bytes 
should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 
65529));
+        assertEquals("test 64k char string truncated to 65,528 utf bytes 
should be 32764", 32764, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 
65528)); // lost 2 byte char (again)
     }
 
     private SimpleRecordField createField(final String fieldName, final 
FieldType type) {

http://git-wip-us.apache.org/repos/asf/nifi/blob/41ad0321/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java b/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java
index 076e507..4696767 100644
--- a/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java
+++ b/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java
@@ -76,16 +76,16 @@ public class StandardRecordWriter extends 
CompressableRecordWriter implements Re
         final ProvenanceEventType recordType = record.getEventType();
 
         out.writeLong(recordIdentifier);
-        writeUTFLimited(out, record.getEventType().name());
+        writeUTFLimited(out, record.getEventType().name(), "EventType");
         out.writeLong(record.getEventTime());
         out.writeLong(record.getFlowFileEntryDate());
         out.writeLong(record.getEventDuration());
         out.writeLong(record.getLineageStartDate());
 
-        writeNullableString(out, record.getComponentId());
-        writeNullableString(out, record.getComponentType());
+        writeNullableString(out, record.getComponentId(), "ComponentId");
+        writeNullableString(out, record.getComponentType(), "ComponentType");
         writeUUID(out, record.getFlowFileUuid());
-        writeNullableString(out, record.getDetails());
+        writeNullableString(out, record.getDetails(), "Details");
 
         // Write FlowFile attributes
         final Map<String, String> attrs = record.getPreviousAttributes();
@@ -105,9 +105,9 @@ public class StandardRecordWriter extends 
CompressableRecordWriter implements Re
         // If Content Claim Info is present, write out a 'TRUE' followed by 
claim info. Else, write out 'false'.
         if (record.getContentClaimSection() != null && 
record.getContentClaimContainer() != null && record.getContentClaimIdentifier() 
!= null) {
             out.writeBoolean(true);
-            writeUTFLimited(out, record.getContentClaimContainer());
-            writeUTFLimited(out, record.getContentClaimSection());
-            writeUTFLimited(out, record.getContentClaimIdentifier());
+            writeUTFLimited(out, record.getContentClaimContainer(), 
"ContentClaimContainer");
+            writeUTFLimited(out, record.getContentClaimSection(), 
"ContentClaimSection");
+            writeUTFLimited(out, record.getContentClaimIdentifier(), 
"ContentClaimIdentifier");
             if (record.getContentClaimOffset() == null) {
                 out.writeLong(0L);
             } else {
@@ -121,9 +121,9 @@ public class StandardRecordWriter extends 
CompressableRecordWriter implements Re
         // If Previous Content Claim Info is present, write out a 'TRUE' 
followed by claim info. Else, write out 'false'.
         if (record.getPreviousContentClaimSection() != null && 
record.getPreviousContentClaimContainer() != null && 
record.getPreviousContentClaimIdentifier() != null) {
             out.writeBoolean(true);
-            writeUTFLimited(out, record.getPreviousContentClaimContainer());
-            writeUTFLimited(out, record.getPreviousContentClaimSection());
-            writeUTFLimited(out, record.getPreviousContentClaimIdentifier());
+            writeUTFLimited(out, record.getPreviousContentClaimContainer(), 
"PreviousContentClaimContainer");
+            writeUTFLimited(out, record.getPreviousContentClaimSection(), 
"PreviousContentClaimSection");
+            writeUTFLimited(out, record.getPreviousContentClaimIdentifier(), 
"PreviousContentClaimIdentifier");
             if (record.getPreviousContentClaimOffset() == null) {
                 out.writeLong(0L);
             } else {
@@ -140,28 +140,28 @@ public class StandardRecordWriter extends 
CompressableRecordWriter implements Re
         }
 
         // write out the identifier of the destination queue.
-        writeNullableString(out, record.getSourceQueueIdentifier());
+        writeNullableString(out, record.getSourceQueueIdentifier(), 
"SourceQueueIdentifier");
 
         // Write type-specific info
         if (recordType == ProvenanceEventType.FORK || recordType == 
ProvenanceEventType.JOIN || recordType == ProvenanceEventType.CLONE || 
recordType == ProvenanceEventType.REPLAY) {
             writeUUIDs(out, record.getParentUuids());
             writeUUIDs(out, record.getChildUuids());
         } else if (recordType == ProvenanceEventType.RECEIVE) {
-            writeNullableString(out, record.getTransitUri());
-            writeNullableString(out, 
record.getSourceSystemFlowFileIdentifier());
+            writeNullableString(out, record.getTransitUri(), "TransitUri");
+            writeNullableString(out, 
record.getSourceSystemFlowFileIdentifier(), "SourceSystemFlowFileIdentifier");
         } else if (recordType == ProvenanceEventType.FETCH) {
-            writeNullableString(out, record.getTransitUri());
+            writeNullableString(out, record.getTransitUri(), "TransitUri");
         } else if (recordType == ProvenanceEventType.SEND) {
-            writeNullableString(out, record.getTransitUri());
+            writeNullableString(out, record.getTransitUri(), "TransitUri");
         } else if (recordType == ProvenanceEventType.ADDINFO) {
-            writeNullableString(out, record.getAlternateIdentifierUri());
+            writeNullableString(out, record.getAlternateIdentifierUri(), 
"AlternateIdentifierUri");
         } else if (recordType == ProvenanceEventType.ROUTE) {
-            writeNullableString(out, record.getRelationship());
+            writeNullableString(out, record.getRelationship(), "Relationship");
         }
     }
 
     protected void writeUUID(final DataOutputStream out, final String uuid) 
throws IOException {
-        writeUTFLimited(out, uuid);
+        writeUTFLimited(out, uuid, "UUID");
     }
 
     protected void writeUUIDs(final DataOutputStream out, final 
Collection<String> list) throws IOException {
@@ -175,12 +175,12 @@ public class StandardRecordWriter extends 
CompressableRecordWriter implements Re
         }
     }
 
-    protected void writeNullableString(final DataOutputStream out, final 
String toWrite) throws IOException {
+    protected void writeNullableString(final DataOutputStream out, final 
String toWrite, String fieldName) throws IOException {
         if (toWrite == null) {
             out.writeBoolean(false);
         } else {
             out.writeBoolean(true);
-            writeUTFLimited(out, toWrite);
+            writeUTFLimited(out, toWrite, fieldName);
         }
     }
 
@@ -199,14 +199,14 @@ public class StandardRecordWriter extends 
CompressableRecordWriter implements Re
         out.write(bytes);
     }
 
-    private void writeUTFLimited(final java.io.DataOutputStream out, final 
String utfString) throws IOException {
+    private void writeUTFLimited(final DataOutputStream out, final String 
utfString, final String fieldName) throws IOException {
         try {
             out.writeUTF(utfString);
         } catch (UTFDataFormatException e) {
-            final String truncated = utfString.substring(0, 
getCharsInUTFLength(utfString, MAX_ALLOWED_UTF_LENGTH));
-            logger.warn("Truncating repository record value!  Attempted to 
write {} chars that encode to a UTF byte length greater than "
+            final String truncated = utfString.substring(0, 
getCharsInUTF8Limit(utfString, MAX_ALLOWED_UTF_LENGTH));
+            logger.warn("Truncating repository record value for field '{}'!  
Attempted to write {} chars that encode to a UTF8 byte length greater than "
                             + "supported maximum ({}), truncating to {} 
chars.",
-                    utfString.length(), MAX_ALLOWED_UTF_LENGTH, 
truncated.length());
+                    (fieldName == null) ? "" : fieldName, utfString.length(), 
MAX_ALLOWED_UTF_LENGTH, truncated.length());
             if (logger.isDebugEnabled()) {
                 logger.warn("String value was:\n{}", truncated);
             }
@@ -214,27 +214,29 @@ public class StandardRecordWriter extends 
CompressableRecordWriter implements Re
         }
     }
 
-    static int getCharsInUTFLength(final String str, final int utfLimit) {
-        // see java.io.DataOutputStream.writeUTF()
-        int strlen = str.length();
-        int utflen = 0;
-        int c;
-
-        /* use charAt instead of copying String to Char array */
-        for (int i = 0; i < strlen; i++) {
-            c = str.charAt(i);
-            if ((c >= 0x0001) & (c <= 0x007F)) {
-                utflen++;
-            } else if (c > 0x07FF) {
-                utflen += 3;
+    static int getCharsInUTF8Limit(final String str, final int utf8Limit) {
+        // Calculate how much of String fits within UTF8 byte limit based on 
RFC3629.
+        //
+        // Java String values use char[] for storage, so character values 
>0xFFFF that
+        // map to 4 byte UTF8 representations are not considered.
+
+        final int charsInOriginal = str.length();
+        int bytesInUTF8 = 0;
+
+        for (int i = 0; i < charsInOriginal; i++) {
+            final int curr = str.charAt(i);
+            if (curr < 0x0080) {
+                bytesInUTF8++;
+            } else if (curr < 0x0800) {
+                bytesInUTF8 += 2;
             } else {
-                utflen += 2;
+                bytesInUTF8 += 3;
             }
-            if (utflen > utfLimit) {
+            if (bytesInUTF8 > utf8Limit) {
                 return i;
             }
         }
-        return strlen;
+        return charsInOriginal;
     }
 
 

Reply via email to