Repository: orc
Updated Branches:
  refs/heads/master 9521a9e4f -> 55104ee79


ORC-406: ORC: Char(n) and Varchar(n) writers truncate to n bytes & corrupt
multi-byte data (gopalv)

Fixes #310

Signed-off-by: Gopal V <gop...@apache.org>
Signed-off-by: Owen O'Malley <omal...@apache.org>
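
The corruption came from clamping char(n)/varchar(n) values to n *bytes*, which can cut a
multi-byte UTF-8 character in half; the fix truncates on character boundaries instead. A minimal,
hypothetical sketch of the difference (plain Java, not ORC code; the class name and literal values
are illustrative only):

import java.nio.charset.StandardCharsets;

// Illustration only: byte-based vs character-based truncation of "αβγ".
public class TruncationSketch {
  public static void main(String[] args) {
    String value = "\u03b1\u03b2\u03b3";                   // "αβγ": 3 characters, 6 UTF-8 bytes
    byte[] utf8 = value.getBytes(StandardCharsets.UTF_8);

    // Old behaviour for varchar(3): keep the first 3 *bytes*, splitting the second character.
    System.out.println(new String(utf8, 0, 3, StandardCharsets.UTF_8)); // "α" + replacement char

    // Fixed behaviour: keep at most 3 *characters*, cutting only on code point boundaries.
    int keep = Math.min(3, value.codePointCount(0, value.length()));
    System.out.println(value.substring(0, value.offsetByCodePoints(0, keep))); // "αβγ"
  }
}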


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/55104ee7
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/55104ee7
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/55104ee7

Branch: refs/heads/master
Commit: 55104ee794ce2c7cf5c00f082d1d675fd0c699f8
Parents: 9521a9e
Author: Gopal V <gop...@apache.org>
Authored: Wed Sep 19 01:49:23 2018 -0700
Committer: Owen O'Malley <omal...@apache.org>
Committed: Thu Sep 20 15:55:39 2018 -0700

----------------------------------------------------------------------
 java/core/src/java/org/apache/orc/OrcUtils.java |   1 +
 .../apache/orc/impl/ColumnStatisticsImpl.java   |  60 +-----
 .../src/java/org/apache/orc/impl/Utf8Utils.java |  86 ++++++++
 .../apache/orc/impl/writer/CharTreeWriter.java  | 114 +++++------
 .../orc/impl/writer/VarcharTreeWriter.java      |  77 +++-----
 .../src/test/org/apache/orc/TestUnicode.java    | 194 +++++++++++++++++++
 6 files changed, 366 insertions(+), 166 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/OrcUtils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index f1267e1..76e5e16 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -602,4 +602,5 @@ public class OrcUtils {
     }
     return result;
   }
+
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index 1b8c801..62a7563 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -775,32 +775,6 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
       return result;
     }
 
-    /**
-     * Find the start of the last character that ends in the current string.
-     * @param text the bytes of the utf-8
-     * @param from the first byte location
-     * @param until the last byte location
-     * @return the index of the last character
-     */
-    private static int findLastCharacter(byte[] text, int from, int until) {
-      int posn = until;
-      /* we don't expect characters more than 5 bytes */
-      while (posn >= from) {
-        if (getCharLength(text[posn]) > 0) {
-          return posn;
-        }
-        posn -= 1;
-      }
-      /* beginning of a valid char not found */
-      throw new IllegalArgumentException(
-          "Could not truncate string, beginning of a valid char not found");
-    }
-
-    private static int getCodePoint(byte[] source, int from, int len) {
-      return new String(source, from, len, StandardCharsets.UTF_8)
-          .codePointAt(0);
-    }
-
     private static void appendCodePoint(Text result, int codepoint) {
       if (codepoint < 0 || codepoint > 0x1f_ffff) {
         throw new IllegalArgumentException("Codepoint out of range " +
@@ -837,13 +811,13 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
      * @return truncated Text value
      */
     private static Text truncateUpperBound(final byte[] text, final int from) {
-      int followingChar = findLastCharacter(text, from,
+      int followingChar = Utf8Utils.findLastCharacter(text, from,
           from + MAX_BYTES_RECORDED);
-      int lastChar = findLastCharacter(text, from, followingChar - 1);
+      int lastChar = Utf8Utils.findLastCharacter(text, from, followingChar - 1);
       Text result = new Text();
       result.set(text, from, lastChar - from);
       appendCodePoint(result,
-          getCodePoint(text, lastChar, followingChar - lastChar) + 1);
+          Utf8Utils.getCodePoint(text, lastChar, followingChar - lastChar) + 1);
       return result;
     }
 
@@ -857,36 +831,12 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
      */
     private static Text truncateLowerBound(final byte[] text, final int from) {
 
-      int lastChar = findLastCharacter(text, from, from + MAX_BYTES_RECORDED);
+      int lastChar = Utf8Utils.findLastCharacter(text, from,
+          from + MAX_BYTES_RECORDED);
       Text result = new Text();
       result.set(text, from, lastChar - from);
       return result;
     }
-
-    /**
-     * A helper function that returns the length of the UTF-8 character
-     * IF the given byte is beginning of a valid char.
-     * In case it is a beginning byte, a value greater than 0
-     * is returned (length of character in bytes).
-     * Else 0 is returned
-     * @param b
-     * @return 0 if not beginning of char else length of char in bytes
-     */
-    private static int getCharLength(byte b) {
-      int len = 0;
-      if((b & 0b10000000) == 0b00000000 ) {
-        len = 1;
-      } else if ((b & 0b11100000) == 0b11000000 ) {
-        len = 2;
-      } else if ((b & 0b11110000) == 0b11100000 ) {
-        len = 3;
-      } else if ((b & 0b11111000) == 0b11110000 ) {
-        len = 4;
-      } else if ((b & 0b11111100) == 0b11111000 ) {
-        len = 5;
-      }
-      return len;
-    }
   }
 
  protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements

http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/impl/Utf8Utils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/Utf8Utils.java b/java/core/src/java/org/apache/orc/impl/Utf8Utils.java
new file mode 100644
index 0000000..6ed6f4d
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/Utf8Utils.java
@@ -0,0 +1,86 @@
+package org.apache.orc.impl;
+
+import java.nio.charset.StandardCharsets;
+
+public final class Utf8Utils {
+
+  public static int charLength(byte[] data, int offset, int length) {
+    int chars = 0;
+    for (int i = 0; i < length; i++) {
+      if (isUtfStartByte(data[offset + i])) {
+        chars++;
+      }
+    }
+    return chars;
+  }
+
+  /**
+   * Return the number of bytes required to read at most
+   * maxCharLength characters in full from a utf-8 encoded byte array provided
+   * by data[offset:offset+length]. This does not validate utf-8 data, but
+   * operates correctly on already valid utf-8 data.
+   *
+   * @param maxCharLength the maximum number of characters to keep
+   * @param data the utf-8 encoded bytes
+   * @param offset the position in data where the value starts
+   * @param length the length of the value in bytes
+   */
+  public static int truncateBytesTo(int maxCharLength, byte[] data, int offset, int length) {
+    int chars = 0;
+    if (length <= maxCharLength) {
+      return length;
+    }
+    for (int i = 0; i < length; i++) {
+      if (isUtfStartByte(data[offset + i])) {
+        chars++;
+      }
+      if (chars > maxCharLength) {
+        return i;
+      }
+    }
+    // everything fits
+    return length;
+  }
+
+  /**
+   * Checks if b is the first byte of a UTF-8 character.
+   *
+   */
+  public static boolean isUtfStartByte(byte b) {
+    return (b & 0xC0) != 0x80;
+  }
+
+  /**
+   * Find the start of the last character that ends in the current string.
+   * @param text the bytes of the utf-8
+   * @param from the first byte location
+   * @param until the last byte location
+   * @return the index of the last character
+   */
+  public static int findLastCharacter(byte[] text, int from, int until) {
+    int posn = until;
+    /* we don't expect characters longer than 5 bytes */
+    while (posn >= from) {
+      if (isUtfStartByte(text[posn])) {
+        return posn;
+      }
+      posn -= 1;
+    }
+    /* beginning of a valid char not found */
+    throw new IllegalArgumentException(
+        "Could not truncate string, beginning of a valid char not found");
+  }
+
+  /**
+   * Get the code point at a given location in the byte array.
+   * @param source the bytes of the string
+   * @param from the offset to start at
+   * @param len the number of bytes in the character
+   * @return the code point
+   */
+  public static int getCodePoint(byte[] source, int from, int len) {
+    return new String(source, from, len, StandardCharsets.UTF_8)
+        .codePointAt(0);
+  }
+
+}
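
A quick, hypothetical usage sketch of the helper class added above (the driver class and literal
values are assumptions; the Utf8Utils methods are the ones introduced in this commit):

import java.nio.charset.StandardCharsets;
import org.apache.orc.impl.Utf8Utils;

// Expected values in the comments follow from charLength/truncateBytesTo above.
public class Utf8UtilsSketch {
  public static void main(String[] args) {
    byte[] utf8 = "\u03b1\u03b2\u03b3".getBytes(StandardCharsets.UTF_8); // "αβγ": 3 chars, 6 bytes

    // charLength counts UTF-8 start bytes, so each multi-byte character counts once.
    int chars = Utf8Utils.charLength(utf8, 0, utf8.length);              // 3

    // truncateBytesTo returns a byte length that keeps whole characters only:
    // keeping at most 2 characters of "αβγ" needs 4 bytes, never 2 or 3.
    int bytes = Utf8Utils.truncateBytesTo(2, utf8, 0, utf8.length);      // 4

    System.out.println(chars + " characters, " + bytes + " bytes for a 2-character limit");
  }
}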

http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java
index 30f2d92..14e3c26 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java
@@ -21,6 +21,7 @@ package org.apache.orc.impl.writer;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.Utf8Utils;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -30,7 +31,7 @@ import java.util.Arrays;
  * Under the covers, char is written to ORC the same way as string.
  */
 public class CharTreeWriter extends StringBaseTreeWriter {
-  private final int itemLength;
+  private final int maxLength;
   private final byte[] padding;
 
   CharTreeWriter(int columnId,
@@ -38,8 +39,9 @@ public class CharTreeWriter extends StringBaseTreeWriter {
                  WriterContext writer,
                  boolean nullable) throws IOException {
     super(columnId, schema, writer, nullable);
-    itemLength = schema.getMaxLength();
-    padding = new byte[itemLength];
+    maxLength = schema.getMaxLength();
+    // a utf-8 character is currently at most 4 bytes long, but it could be up to 6
+    padding = new byte[6*maxLength];
   }
 
   @Override
@@ -49,74 +51,56 @@ public class CharTreeWriter extends StringBaseTreeWriter {
     BytesColumnVector vec = (BytesColumnVector) vector;
     if (vector.isRepeating) {
       if (vector.noNulls || !vector.isNull[0]) {
-        byte[] ptr;
-        int ptrOffset;
-        if (vec.length[0] >= itemLength) {
-          ptr = vec.vector[0];
-          ptrOffset = vec.start[0];
-        } else {
-          ptr = padding;
-          ptrOffset = 0;
-          System.arraycopy(vec.vector[0], vec.start[0], ptr, 0,
-              vec.length[0]);
-          Arrays.fill(ptr, vec.length[0], itemLength, (byte) ' ');
-        }
-        if (useDictionaryEncoding) {
-          int id = dictionary.add(ptr, ptrOffset, itemLength);
-          for(int i=0; i < length; ++i) {
-            rows.add(id);
-          }
-        } else {
-          for(int i=0; i < length; ++i) {
-            directStreamOutput.write(ptr, ptrOffset, itemLength);
-            lengthOutput.write(itemLength);
-          }
-        }
-        indexStatistics.updateString(ptr, ptrOffset, itemLength, length);
-        if (createBloomFilter) {
-          if (bloomFilter != null) {
-            // translate from UTF-8 to the default charset
-            bloomFilter.addString(new String(vec.vector[0], vec.start[0],
-                vec.length[0], StandardCharsets.UTF_8));
-          }
-          bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
-        }
+        // 0, length times
+        writePadded(vec, 0, length);
       }
     } else {
       for(int i=0; i < length; ++i) {
         if (vec.noNulls || !vec.isNull[i + offset]) {
-          byte[] ptr;
-          int ptrOffset;
-          if (vec.length[offset + i] >= itemLength) {
-            ptr = vec.vector[offset + i];
-            ptrOffset = vec.start[offset + i];
-          } else {
-            // it is the wrong length, so copy it
-            ptr = padding;
-            ptrOffset = 0;
-            System.arraycopy(vec.vector[offset + i], vec.start[offset + i],
-                ptr, 0, vec.length[offset + i]);
-            Arrays.fill(ptr, vec.length[offset + i], itemLength, (byte) ' ');
-          }
-          if (useDictionaryEncoding) {
-            rows.add(dictionary.add(ptr, ptrOffset, itemLength));
-          } else {
-            directStreamOutput.write(ptr, ptrOffset, itemLength);
-            lengthOutput.write(itemLength);
-          }
-          indexStatistics.updateString(ptr, ptrOffset, itemLength, 1);
-          if (createBloomFilter) {
-            if (bloomFilter != null) {
-              // translate from UTF-8 to the default charset
-              bloomFilter.addString(new String(vec.vector[offset + i],
-                  vec.start[offset + i], vec.length[offset + i],
-                  StandardCharsets.UTF_8));
-            }
-            bloomFilterUtf8.addBytes(vec.vector[offset + i],
-                vec.start[offset + i], vec.length[offset + i]);
-          }
+          // offset + i, once per loop
+          writePadded(vec, i + offset, 1);
         }
       }
     }
   }
+
+  private void writePadded(BytesColumnVector vec, int row, int repeats) throws IOException {
+    final byte[] ptr;
+    final int ptrOffset;
+    final int ptrLength;
+    int charLength = Utf8Utils.charLength(vec.vector[row], vec.start[row], vec.length[row]);
+    if (charLength >= maxLength) {
+      ptr = vec.vector[row];
+      ptrOffset = vec.start[row];
+      ptrLength =
+          Utf8Utils
+              .truncateBytesTo(maxLength, vec.vector[row], vec.start[row], vec.length[row]);
+    } else {
+      ptr = padding;
+      // the padding is exactly 1 byte per char
+      ptrLength = vec.length[row] + (maxLength - charLength);
+      ptrOffset = 0;
+      System.arraycopy(vec.vector[row], vec.start[row], ptr, 0, vec.length[row]);
+      Arrays.fill(ptr, vec.length[row], ptrLength, (byte) ' ');
+    }
+    if (useDictionaryEncoding) {
+      int id = dictionary.add(ptr, ptrOffset, ptrLength);
+      for (int i = 0; i < repeats; ++i) {
+        rows.add(id);
+      }
+    } else {
+      for (int i = 0; i < repeats; ++i) {
+        directStreamOutput.write(ptr, ptrOffset, ptrLength);
+        lengthOutput.write(ptrLength);
+      }
+    }
+    indexStatistics.updateString(ptr, ptrOffset, ptrLength, repeats);
+    if (createBloomFilter) {
+      if (bloomFilter != null) {
+        // translate from UTF-8 to the default charset
+        bloomFilter.addString(new String(ptr, ptrOffset, ptrLength, StandardCharsets.UTF_8));
+      }
+      bloomFilterUtf8.addBytes(ptr, ptrOffset, ptrLength);
+    }
+  }
 }
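
The char(n) sizing rule in writePadded above boils down to: if the value has at least maxLength
characters, keep only the bytes of the first maxLength characters; otherwise append one space byte
per missing character. A hypothetical sketch of just that arithmetic (not the writer itself):

import java.nio.charset.StandardCharsets;
import org.apache.orc.impl.Utf8Utils;

// Sketch of the char(n) stored-length computation; assumes valid UTF-8 input.
public class CharPaddingSketch {
  static int storedByteLength(byte[] utf8, int maxChars) {
    int chars = Utf8Utils.charLength(utf8, 0, utf8.length);
    if (chars >= maxChars) {
      // at or over the limit: keep exactly maxChars whole characters
      return Utf8Utils.truncateBytesTo(maxChars, utf8, 0, utf8.length);
    }
    // under the limit: the original bytes plus one single-byte space per missing character
    return utf8.length + (maxChars - chars);
  }

  public static void main(String[] args) {
    byte[] greek = "\u03b1\u03b2\u03b3".getBytes(StandardCharsets.UTF_8); // "αβγ": 3 chars, 6 bytes
    System.out.println(storedByteLength(greek, 2)); // 4 -> "αβ"
    System.out.println(storedByteLength(greek, 5)); // 8 -> "αβγ  "
  }
}

This is also why the padding buffer is sized at 6*maxLength bytes: a value of fewer than maxLength
characters plus its space padding always fits within that bound.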

http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java
index 17d3f61..b08ef43 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java
@@ -21,6 +21,7 @@ package org.apache.orc.impl.writer;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.Utf8Utils;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -46,58 +47,42 @@ public class VarcharTreeWriter extends StringBaseTreeWriter {
     BytesColumnVector vec = (BytesColumnVector) vector;
     if (vector.isRepeating) {
       if (vector.noNulls || !vector.isNull[0]) {
-        int itemLength = Math.min(vec.length[0], maxLength);
-        if (useDictionaryEncoding) {
-          int id = dictionary.add(vec.vector[0], vec.start[0], itemLength);
-          for(int i=0; i < length; ++i) {
-            rows.add(id);
-          }
-        } else {
-          for(int i=0; i < length; ++i) {
-            directStreamOutput.write(vec.vector[0], vec.start[0],
-                itemLength);
-            lengthOutput.write(itemLength);
-          }
-        }
-        indexStatistics.updateString(vec.vector[0], vec.start[0],
-            itemLength, length);
-        if (createBloomFilter) {
-          if (bloomFilter != null) {
-            // translate from UTF-8 to the default charset
-            bloomFilter.addString(new String(vec.vector[0],
-                vec.start[0], itemLength,
-                StandardCharsets.UTF_8));
-          }
-          bloomFilterUtf8.addBytes(vec.vector[0],
-              vec.start[0], itemLength);
-        }
+        // 0, length times
+        writeTruncated(vec, 0, length);
       }
     } else {
       for(int i=0; i < length; ++i) {
         if (vec.noNulls || !vec.isNull[i + offset]) {
-          int itemLength = Math.min(vec.length[offset + i], maxLength);
-          if (useDictionaryEncoding) {
-            rows.add(dictionary.add(vec.vector[offset + i],
-                vec.start[offset + i], itemLength));
-          } else {
-            directStreamOutput.write(vec.vector[offset + i],
-                vec.start[offset + i], itemLength);
-            lengthOutput.write(itemLength);
-          }
-          indexStatistics.updateString(vec.vector[offset + i],
-              vec.start[offset + i], itemLength, 1);
-          if (createBloomFilter) {
-            if (bloomFilter != null) {
-              // translate from UTF-8 to the default charset
-              bloomFilter.addString(new String(vec.vector[offset + i],
-                  vec.start[offset + i], itemLength,
-                  StandardCharsets.UTF_8));
-            }
-            bloomFilterUtf8.addBytes(vec.vector[offset + i],
-                vec.start[offset + i], itemLength);
-          }
+          // offset + i, once per loop
+          writeTruncated(vec, i + offset, 1);
         }
       }
     }
   }
+
+  private void writeTruncated(BytesColumnVector vec, int row, int repeats)
+      throws IOException {
+    int itemLength =
+        Utf8Utils.truncateBytesTo(maxLength, vec.vector[row], vec.start[row], vec.length[row]);
+    if (useDictionaryEncoding) {
+      int id = dictionary.add(vec.vector[row], vec.start[row], itemLength);
+      for (int i = 0; i < repeats; ++i) {
+        rows.add(id);
+      }
+    } else {
+      for (int i = 0; i < repeats; ++i) {
+        directStreamOutput.write(vec.vector[row], vec.start[row], itemLength);
+        lengthOutput.write(itemLength);
+      }
+    }
+    indexStatistics.updateString(vec.vector[row], vec.start[row], itemLength, repeats);
+    if (createBloomFilter) {
+      if (bloomFilter != null) {
+        // translate from UTF-8 to the default charset
+        bloomFilter.addString(new String(vec.vector[row], vec.start[row], itemLength,
+            StandardCharsets.UTF_8));
+      }
+      bloomFilterUtf8.addBytes(vec.vector[row], vec.start[row], itemLength);
+    }
+  }
 }
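
For varchar(n) there is no padding; writeTruncated simply keeps at most maxLength whole characters.
A hypothetical sketch of the effect of its length computation (literal values are illustrative
only):

import java.nio.charset.StandardCharsets;
import org.apache.orc.impl.Utf8Utils;

// Sketch only: 4-byte emoji show that truncation never lands mid-character.
public class VarcharTruncationSketch {
  public static void main(String[] args) {
    // Two emoji code points, 8 UTF-8 bytes in total.
    byte[] utf8 = "\ud83c\udf7a\ud83e\udd43".getBytes(StandardCharsets.UTF_8);

    // varchar(1): keep 4 bytes (one whole emoji), never 1 byte of it.
    System.out.println(Utf8Utils.truncateBytesTo(1, utf8, 0, utf8.length)); // 4

    // varchar(3): the whole value already fits, so all 8 bytes are kept.
    System.out.println(Utf8Utils.truncateBytesTo(3, utf8, 0, utf8.length)); // 8
  }
}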

http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/test/org/apache/orc/TestUnicode.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestUnicode.java b/java/core/src/test/org/apache/orc/TestUnicode.java
new file mode 100644
index 0000000..45565a0
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/TestUnicode.java
@@ -0,0 +1,194 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collection;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(Parameterized.class)
+public class TestUnicode {
+  Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+      + File.separator + "tmp"));
+
+  Configuration conf;
+  FileSystem fs;
+  Path testFilePath;
+
+  private final String type;
+  private final int maxLength;
+  private final boolean hasRTrim;
+
+  @Parameters
+  public static Collection<Object[]> data() {
+    ArrayList<Object[]> data = new ArrayList<>();
+    for (int j = 0; j < 2; j++) {
+      for (int i = 1; i <= 5; i++) {
+        data.add(new Object[] { j == 0 ? "char" : "varchar", i, true });
+      }
+    }
+    //data.add(new Object[] {"char", 3});
+    return data;
+  }
+
+  public TestUnicode(String type, int maxLength, boolean hasRTrim) {
+    this.type = type;
+    this.maxLength = maxLength;
+    this.hasRTrim = hasRTrim;
+  }
+
+  static final String[] utf8strs = new String[] {
+      // Character.UnicodeBlock GREEK (2 bytes)
+      "\u03b1\u03b2\u03b3", "\u03b1\u03b2", "\u03b1\u03b2\u03b3\u03b4",
+      "\u03b1\u03b2\u03b3\u03b4",
+      // Character.UnicodeBlock MALAYALAM (3 bytes)
+      "\u0d06\u0d30\u0d3e", "\u0d0e\u0d28\u0d4d\u0d24\u0d3e", "\u0d13\u0d7c\u0d15\u0d4d",
+      // Unicode emoji (4 bytes)
+      "\u270f\ufe0f\ud83d\udcdd\u270f\ufe0f", "\ud83c\udf3b\ud83d\udc1d\ud83c\udf6f",
+      "\ud83c\udf7a\ud83e\udd43\ud83c\udf77" };
+
+  @Rule
+  public TestName testCaseName = new TestName();
+
+  @Before
+  public void openFileSystem() throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+    fs.delete(testFilePath, false);
+  }
+
+  @Test
+  public void testUtf8() throws Exception {
+    if ("varchar".equals(type)) {
+      testVarChar(maxLength);
+    } else {
+      testChar(maxLength);
+    }
+  }
+
+  // copied from HiveBaseChar
+  public static String enforceMaxLength(String val, int maxLength) {
+    if (val == null) {
+      return null;
+    }
+    String value = val;
+
+    if (maxLength > 0) {
+      int valLength = val.codePointCount(0, val.length());
+      if (valLength > maxLength) {
+        // Truncate the excess chars to fit the character length.
+        // Also make sure we take supplementary chars into account.
+        value = val.substring(0, val.offsetByCodePoints(0, maxLength));
+      }
+    }
+    return value;
+  }
+
+  // copied from HiveBaseChar
+  public static String getPaddedValue(String val, int maxLength, boolean rtrim) {
+    if (val == null) {
+      return null;
+    }
+    if (maxLength < 0) {
+      return val;
+    }
+
+    int valLength = val.codePointCount(0, val.length());
+    if (valLength > maxLength) {
+      return enforceMaxLength(val, maxLength);
+    }
+
+    if (maxLength > valLength && rtrim == false) {
+      // Make sure we pad the right amount of spaces; valLength is in terms of code points,
+      // while StringUtils.rpad() is based on the number of java chars.
+      int padLength = val.length() + (maxLength - valLength);
+      val = StringUtils.rightPad(val, padLength);
+    }
+    return val;
+  }
+
+  public void testChar(int maxLength) throws Exception {
+    // char(n)
+    TypeDescription schema = TypeDescription.createChar().withMaxLength(maxLength);
+    String[] expected = new String[utf8strs.length];
+    for (int i = 0; i < utf8strs.length; i++) {
+      expected[i] = getPaddedValue(utf8strs[i], maxLength, hasRTrim);
+    }
+    verifyWrittenStrings(schema, utf8strs, expected);
+  }
+
+  public void testVarChar(int maxLength) throws Exception {
+    // varchar(n)
+    TypeDescription schema = TypeDescription.createVarchar().withMaxLength(maxLength);
+    String[] expected = new String[utf8strs.length];
+    for (int i = 0; i < utf8strs.length; i++) {
+      expected[i] = enforceMaxLength(utf8strs[i], maxLength);
+    }
+    verifyWrittenStrings(schema, utf8strs, expected);
+  }
+
+  public void verifyWrittenStrings(TypeDescription schema, String[] inputs, String[] expected)
+      throws Exception {
+    Writer writer =
+        OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)
+            .compress(CompressionKind.NONE).bufferSize(10000));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+    for (int i = 0; i < inputs.length; i++) {
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+      col.setVal(batch.size++, inputs[i].getBytes());
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader =
+        OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+    RecordReader rows = reader.rows();
+    batch = reader.getSchema().createRowBatch();
+    col = (BytesColumnVector) batch.cols[0];
+    int idx = 0;
+    while (rows.nextBatch(batch)) {
+      for (int r = 0; r < batch.size; ++r) {
+        assertEquals(String.format("test for %s:%d", schema, maxLength), expected[idx],
+            col.toString(r));
+        idx++;
+      }
+    }
+    fs.delete(testFilePath, false);
+  }
+}
