Repository: orc
Updated Branches:
  refs/heads/master 9521a9e4f -> 55104ee79
ORC-406: ORC: Char(n) and Varchar(n) writers truncate to n bytes & corrupts multi-byte data (gopalv)

Fixes #310

Signed-off-by: Gopal V <gop...@apache.org>
Signed-off-by: Owen O'Malley <omal...@apache.org>

Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/55104ee7
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/55104ee7
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/55104ee7

Branch: refs/heads/master
Commit: 55104ee794ce2c7cf5c00f082d1d675fd0c699f8
Parents: 9521a9e
Author: Gopal V <gop...@apache.org>
Authored: Wed Sep 19 01:49:23 2018 -0700
Committer: Owen O'Malley <omal...@apache.org>
Committed: Thu Sep 20 15:55:39 2018 -0700

----------------------------------------------------------------------
 java/core/src/java/org/apache/orc/OrcUtils.java |   1 +
 .../apache/orc/impl/ColumnStatisticsImpl.java   |  60 +-----
 .../src/java/org/apache/orc/impl/Utf8Utils.java |  86 ++++++++
 .../apache/orc/impl/writer/CharTreeWriter.java  | 114 +++++------
 .../orc/impl/writer/VarcharTreeWriter.java      |  77 +++-----
 .../src/test/org/apache/orc/TestUnicode.java    | 194 +++++++++++++++++++
 6 files changed, 366 insertions(+), 166 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/OrcUtils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index f1267e1..76e5e16 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -602,4 +602,5 @@ public class OrcUtils {
     }
     return result;
   }
+
 }
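The bug in a nutshell, as an illustrative stand-alone snippet (not part of the commit): truncating UTF-8 text at a byte count can split a multi-byte character, and the dangling tail decodes as U+FFFD. Truncating at a character boundary, as the changes below do, keeps the data valid:

    import java.nio.charset.StandardCharsets;

    public class TruncationDemo {
      public static void main(String[] args) {
        // "αβγ" is 3 characters but 6 bytes in UTF-8 (2 bytes each).
        byte[] utf8 = "\u03b1\u03b2\u03b3".getBytes(StandardCharsets.UTF_8);
        // Old behavior: varchar(3) kept the first 3 *bytes*, splitting β in half.
        String broken = new String(utf8, 0, 3, StandardCharsets.UTF_8);
        System.out.println(broken); // "α" followed by U+FFFD
        // Fixed behavior: keep the first 3 *characters*, i.e. all 6 bytes here.
        String ok = new String(utf8, 0, 6, StandardCharsets.UTF_8);
        System.out.println(ok);     // "αβγ"
      }
    }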

http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index 1b8c801..62a7563 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -775,32 +775,6 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
       return result;
     }
 
-    /**
-     * Find the start of the last character that ends in the current string.
-     * @param text the bytes of the utf-8
-     * @param from the first byte location
-     * @param until the last byte location
-     * @return the index of the last character
-     */
-    private static int findLastCharacter(byte[] text, int from, int until) {
-      int posn = until;
-      /* we don't expect characters more than 5 bytes */
-      while (posn >= from) {
-        if (getCharLength(text[posn]) > 0) {
-          return posn;
-        }
-        posn -= 1;
-      }
-      /* beginning of a valid char not found */
-      throw new IllegalArgumentException(
-          "Could not truncate string, beginning of a valid char not found");
-    }
-
-    private static int getCodePoint(byte[] source, int from, int len) {
-      return new String(source, from, len, StandardCharsets.UTF_8)
-          .codePointAt(0);
-    }
-
     private static void appendCodePoint(Text result, int codepoint) {
       if (codepoint < 0 || codepoint > 0x1f_ffff) {
         throw new IllegalArgumentException("Codepoint out of range " +
@@ -837,13 +811,13 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
      * @return truncated Text value
      */
     private static Text truncateUpperBound(final byte[] text, final int from) {
-      int followingChar = findLastCharacter(text, from,
+      int followingChar = Utf8Utils.findLastCharacter(text, from,
           from + MAX_BYTES_RECORDED);
-      int lastChar = findLastCharacter(text, from, followingChar - 1);
+      int lastChar = Utf8Utils.findLastCharacter(text, from, followingChar - 1);
       Text result = new Text();
       result.set(text, from, lastChar - from);
       appendCodePoint(result,
-          getCodePoint(text, lastChar, followingChar - lastChar) + 1);
+          Utf8Utils.getCodePoint(text, lastChar, followingChar - lastChar) + 1);
       return result;
     }
 
@@ -857,36 +831,12 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
      */
     private static Text truncateLowerBound(final byte[] text, final int from) {
-      int lastChar = findLastCharacter(text, from, from + MAX_BYTES_RECORDED);
+      int lastChar = Utf8Utils.findLastCharacter(text, from,
+          from + MAX_BYTES_RECORDED);
       Text result = new Text();
       result.set(text, from, lastChar - from);
       return result;
     }
-
-    /**
-     * A helper function that returns the length of the UTF-8 character
-     * IF the given byte is beginning of a valid char.
-     * In case it is a beginning byte, a value greater than 0
-     * is returned (length of character in bytes).
-     * Else 0 is returned
-     * @param b
-     * @return 0 if not beginning of char else length of char in bytes
-     */
-    private static int getCharLength(byte b) {
-      int len = 0;
-      if((b & 0b10000000) == 0b00000000 ) {
-        len = 1;
-      } else if ((b & 0b11100000) == 0b11000000 ) {
-        len = 2;
-      } else if ((b & 0b11110000) == 0b11100000 ) {
-        len = 3;
-      } else if ((b & 0b11111000) == 0b11110000 ) {
-        len = 4;
-      } else if ((b & 0b11111100) == 0b11111000 ) {
-        len = 5;
-      }
-      return len;
-    }
   }
 
   protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements


http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/impl/Utf8Utils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/Utf8Utils.java b/java/core/src/java/org/apache/orc/impl/Utf8Utils.java
new file mode 100644
index 0000000..6ed6f4d
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/Utf8Utils.java
@@ -0,0 +1,86 @@
+package org.apache.orc.impl;
+
+import java.nio.charset.StandardCharsets;
+
+public final class Utf8Utils {
+
+  public static int charLength(byte[] data, int offset, int length) {
+    int chars = 0;
+    for (int i = 0; i < length; i++) {
+      if (isUtfStartByte(data[offset + i])) {
+        chars++;
+      }
+    }
+    return chars;
+  }
+
+  /**
+   * Return the number of bytes required to read at most
+   * maxLength characters in full from a utf-8 encoded byte array provided
+   * by data[offset:offset+length]. This does not validate utf-8 data, but
+   * operates correctly on already valid utf-8 data.
+   *
+   * @param maxCharLength the maximum number of characters to keep
+   * @param data the utf-8 encoded bytes
+   * @param offset the offset of the first byte
+   * @param length the number of bytes to scan
+   */
+  public static int truncateBytesTo(int maxCharLength, byte[] data, int offset, int length) {
+    int chars = 0;
+    if (length <= maxCharLength) {
+      return length;
+    }
+    for (int i = 0; i < length; i++) {
+      if (isUtfStartByte(data[offset + i])) {
+        chars++;
+      }
+      if (chars > maxCharLength) {
+        return i;
+      }
+    }
+    // everything fits
+    return length;
+  }
+
+  /**
+   * Checks if b is the first byte of a UTF-8 character.
+   */
+  public static boolean isUtfStartByte(byte b) {
+    return (b & 0xC0) != 0x80;
+  }
+
+  /**
+   * Find the start of the last character that ends in the current string.
+   * @param text the bytes of the utf-8
+   * @param from the first byte location
+   * @param until the last byte location
+   * @return the index of the last character
+   */
+  public static int findLastCharacter(byte[] text, int from, int until) {
+    int posn = until;
+    /* we don't expect characters more than 5 bytes */
+    while (posn >= from) {
+      if (isUtfStartByte(text[posn])) {
+        return posn;
+      }
+      posn -= 1;
+    }
+    /* beginning of a valid char not found */
+    throw new IllegalArgumentException(
+        "Could not truncate string, beginning of a valid char not found");
+  }
+
+  /**
+   * Get the code point at a given location in the byte array.
+   * @param source the bytes of the string
+   * @param from the offset to start at
+   * @param len the number of bytes in the character
+   * @return the code point
+   */
+  public static int getCodePoint(byte[] source, int from, int len) {
+    return new String(source, from, len, StandardCharsets.UTF_8)
+        .codePointAt(0);
+  }
+
+}
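A usage sketch of the new Utf8Utils helpers above; the expected values in the comments follow from the code as committed, but the demo class itself is illustrative and not part of the commit:

    import java.nio.charset.StandardCharsets;
    import org.apache.orc.impl.Utf8Utils;

    public class Utf8UtilsDemo {
      public static void main(String[] args) {
        // "αβγ": 3 characters, 6 bytes; every second byte is a continuation byte.
        byte[] data = "\u03b1\u03b2\u03b3".getBytes(StandardCharsets.UTF_8);
        System.out.println(Utf8Utils.charLength(data, 0, data.length));          // 3
        // Keeping at most 2 characters needs the first 4 bytes, not 2.
        System.out.println(Utf8Utils.truncateBytesTo(2, data, 0, data.length));  // 4
        // The last character starts at byte offset 4.
        System.out.println(Utf8Utils.findLastCharacter(data, 0, data.length - 1)); // 4
      }
    }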

http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java
index 30f2d92..14e3c26 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/CharTreeWriter.java
@@ -21,6 +21,7 @@ package org.apache.orc.impl.writer;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.Utf8Utils;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -30,7 +31,7 @@ import java.util.Arrays;
  * Under the covers, char is written to ORC the same way as string.
  */
 public class CharTreeWriter extends StringBaseTreeWriter {
-  private final int itemLength;
+  private final int maxLength;
   private final byte[] padding;
 
   CharTreeWriter(int columnId,
@@ -38,8 +39,9 @@ public class CharTreeWriter extends StringBaseTreeWriter {
                  WriterContext writer,
                  boolean nullable) throws IOException {
     super(columnId, schema, writer, nullable);
-    itemLength = schema.getMaxLength();
-    padding = new byte[itemLength];
+    maxLength = schema.getMaxLength();
+    // utf-8 characters are currently at most 4 bytes long, but could be up to 6
+    padding = new byte[6 * maxLength];
   }
 
   @Override
@@ -49,74 +51,56 @@
     BytesColumnVector vec = (BytesColumnVector) vector;
     if (vector.isRepeating) {
       if (vector.noNulls || !vector.isNull[0]) {
-        byte[] ptr;
-        int ptrOffset;
-        if (vec.length[0] >= itemLength) {
-          ptr = vec.vector[0];
-          ptrOffset = vec.start[0];
-        } else {
-          ptr = padding;
-          ptrOffset = 0;
-          System.arraycopy(vec.vector[0], vec.start[0], ptr, 0,
-              vec.length[0]);
-          Arrays.fill(ptr, vec.length[0], itemLength, (byte) ' ');
-        }
-        if (useDictionaryEncoding) {
-          int id = dictionary.add(ptr, ptrOffset, itemLength);
-          for(int i=0; i < length; ++i) {
-            rows.add(id);
-          }
-        } else {
-          for(int i=0; i < length; ++i) {
-            directStreamOutput.write(ptr, ptrOffset, itemLength);
-            lengthOutput.write(itemLength);
-          }
-        }
-        indexStatistics.updateString(ptr, ptrOffset, itemLength, length);
-        if (createBloomFilter) {
-          if (bloomFilter != null) {
-            // translate from UTF-8 to the default charset
-            bloomFilter.addString(new String(vec.vector[0], vec.start[0],
-                vec.length[0], StandardCharsets.UTF_8));
-          }
-          bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
-        }
+        // 0, length times
+        writePadded(vec, 0, length);
       }
     } else {
       for(int i=0; i < length; ++i) {
         if (vec.noNulls || !vec.isNull[i + offset]) {
-          byte[] ptr;
-          int ptrOffset;
-          if (vec.length[offset + i] >= itemLength) {
-            ptr = vec.vector[offset + i];
-            ptrOffset = vec.start[offset + i];
-          } else {
-            // it is the wrong length, so copy it
-            ptr = padding;
-            ptrOffset = 0;
-            System.arraycopy(vec.vector[offset + i], vec.start[offset + i],
-                ptr, 0, vec.length[offset + i]);
-            Arrays.fill(ptr, vec.length[offset + i], itemLength, (byte) ' ');
-          }
-          if (useDictionaryEncoding) {
-            rows.add(dictionary.add(ptr, ptrOffset, itemLength));
-          } else {
-            directStreamOutput.write(ptr, ptrOffset, itemLength);
-            lengthOutput.write(itemLength);
-          }
-          indexStatistics.updateString(ptr, ptrOffset, itemLength, 1);
-          if (createBloomFilter) {
-            if (bloomFilter != null) {
-              // translate from UTF-8 to the default charset
-              bloomFilter.addString(new String(vec.vector[offset + i],
-                  vec.start[offset + i], vec.length[offset + i],
-                  StandardCharsets.UTF_8));
-            }
-            bloomFilterUtf8.addBytes(vec.vector[offset + i],
-                vec.start[offset + i], vec.length[offset + i]);
-          }
+          // offset + i, once per loop
+          writePadded(vec, i + offset, 1);
         }
       }
     }
   }
+
+  private void writePadded(BytesColumnVector vec, int row, int repeats) throws IOException {
+    final byte[] ptr;
+    final int ptrOffset;
+    final int ptrLength;
+    int charLength = Utf8Utils.charLength(vec.vector[row], vec.start[row], vec.length[row]);
+    if (charLength >= maxLength) {
+      ptr = vec.vector[row];
+      ptrOffset = vec.start[row];
+      ptrLength = Utf8Utils
+          .truncateBytesTo(maxLength, vec.vector[row], vec.start[row], vec.length[row]);
+    } else {
+      ptr = padding;
+      // the padding is exactly 1 byte per char
+      ptrLength = vec.length[row] + (maxLength - charLength);
+      ptrOffset = 0;
+      System.arraycopy(vec.vector[row], vec.start[row], ptr, 0, vec.length[row]);
+      Arrays.fill(ptr, vec.length[row], ptrLength, (byte) ' ');
+    }
+    if (useDictionaryEncoding) {
+      int id = dictionary.add(ptr, ptrOffset, ptrLength);
+      for (int i = 0; i < repeats; ++i) {
+        rows.add(id);
+      }
+    } else {
+      for (int i = 0; i < repeats; ++i) {
+        directStreamOutput.write(ptr, ptrOffset, ptrLength);
+        lengthOutput.write(ptrLength);
+      }
+    }
+    indexStatistics.updateString(ptr, ptrOffset, ptrLength, repeats);
+    if (createBloomFilter) {
+      if (bloomFilter != null) {
+        // translate from UTF-8 to the default charset
+        bloomFilter.addString(new String(ptr, ptrOffset, ptrLength, StandardCharsets.UTF_8));
+      }
+      bloomFilterUtf8.addBytes(ptr, ptrOffset, ptrLength);
+    }
+  }
 }
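To make writePadded's length arithmetic concrete, an illustrative sketch (the demo values and class are mine, not the commit's): the pad is always single-byte spaces, so the target byte length is the value's byte length plus one byte per missing character.

    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;

    public class CharPaddingDemo {
      public static void main(String[] args) {
        int maxLength = 4;                 // char(4)
        byte[] value = "\u03b1\u03b2".getBytes(StandardCharsets.UTF_8); // "αβ": 2 chars, 4 bytes
        int charLength = 2;                // as Utf8Utils.charLength would compute
        // ptrLength = byte length + one single-byte space per missing character:
        int ptrLength = value.length + (maxLength - charLength);        // 4 + 2 = 6
        byte[] padded = Arrays.copyOf(value, ptrLength);
        Arrays.fill(padded, value.length, ptrLength, (byte) ' ');
        // The 6 bytes decode back to "αβ  ": 4 characters, as char(4) requires.
        System.out.println("[" + new String(padded, StandardCharsets.UTF_8) + "]");
      }
    }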

http://git-wip-us.apache.org/repos/asf/orc/blob/55104ee7/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java
index 17d3f61..b08ef43 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/VarcharTreeWriter.java
@@ -21,6 +21,7 @@ package org.apache.orc.impl.writer;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.Utf8Utils;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -46,58 +47,42 @@ public class VarcharTreeWriter extends StringBaseTreeWriter {
     BytesColumnVector vec = (BytesColumnVector) vector;
     if (vector.isRepeating) {
       if (vector.noNulls || !vector.isNull[0]) {
-        int itemLength = Math.min(vec.length[0], maxLength);
-        if (useDictionaryEncoding) {
-          int id = dictionary.add(vec.vector[0], vec.start[0], itemLength);
-          for(int i=0; i < length; ++i) {
-            rows.add(id);
-          }
-        } else {
-          for(int i=0; i < length; ++i) {
-            directStreamOutput.write(vec.vector[0], vec.start[0],
-                itemLength);
-            lengthOutput.write(itemLength);
-          }
-        }
-        indexStatistics.updateString(vec.vector[0], vec.start[0],
-            itemLength, length);
-        if (createBloomFilter) {
-          if (bloomFilter != null) {
-            // translate from UTF-8 to the default charset
-            bloomFilter.addString(new String(vec.vector[0],
-                vec.start[0], itemLength,
-                StandardCharsets.UTF_8));
-          }
-          bloomFilterUtf8.addBytes(vec.vector[0],
-              vec.start[0], itemLength);
-        }
+        // 0, length times
+        writeTruncated(vec, 0, length);
       }
     } else {
       for(int i=0; i < length; ++i) {
         if (vec.noNulls || !vec.isNull[i + offset]) {
-          int itemLength = Math.min(vec.length[offset + i], maxLength);
-          if (useDictionaryEncoding) {
-            rows.add(dictionary.add(vec.vector[offset + i],
-                vec.start[offset + i], itemLength));
-          } else {
-            directStreamOutput.write(vec.vector[offset + i],
-                vec.start[offset + i], itemLength);
-            lengthOutput.write(itemLength);
-          }
-          indexStatistics.updateString(vec.vector[offset + i],
-              vec.start[offset + i], itemLength, 1);
-          if (createBloomFilter) {
-            if (bloomFilter != null) {
-              // translate from UTF-8 to the default charset
-              bloomFilter.addString(new String(vec.vector[offset + i],
-                  vec.start[offset + i], itemLength,
-                  StandardCharsets.UTF_8));
-            }
-            bloomFilterUtf8.addBytes(vec.vector[offset + i],
-                vec.start[offset + i], itemLength);
-          }
+          // offset + i, once per loop
+          writeTruncated(vec, i + offset, 1);
         }
       }
     }
   }
+
+  private void writeTruncated(BytesColumnVector vec, int row, int repeats)
+      throws IOException {
+    int itemLength =
+        Utf8Utils.truncateBytesTo(maxLength, vec.vector[row], vec.start[row], vec.length[row]);
+    if (useDictionaryEncoding) {
+      int id = dictionary.add(vec.vector[row], vec.start[row], itemLength);
+      for (int i = 0; i < repeats; ++i) {
+        rows.add(id);
+      }
+    } else {
+      for (int i = 0; i < repeats; ++i) {
+        directStreamOutput.write(vec.vector[row], vec.start[row], itemLength);
+        lengthOutput.write(itemLength);
+      }
+    }
+    indexStatistics.updateString(vec.vector[row], vec.start[row], itemLength, repeats);
+    if (createBloomFilter) {
+      if (bloomFilter != null) {
+        // translate from UTF-8 to the default charset
+        bloomFilter.addString(new String(vec.vector[row], vec.start[row], itemLength,
+            StandardCharsets.UTF_8));
+      }
+      bloomFilterUtf8.addBytes(vec.vector[row], vec.start[row], itemLength);
+    }
+  }
 }
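For contrast, an illustrative before/after of the varchar path (not part of the commit): the old code truncated with Math.min over bytes, the new code with Utf8Utils.truncateBytesTo over characters.

    import java.nio.charset.StandardCharsets;
    import org.apache.orc.impl.Utf8Utils;

    public class VarcharTruncationDemo {
      public static void main(String[] args) {
        byte[] data = "\u03b1\u03b2\u03b3".getBytes(StandardCharsets.UTF_8); // "αβγ", 6 bytes
        int maxLength = 3; // varchar(3)
        int oldLength = Math.min(data.length, maxLength);                        // 3 bytes, cuts β in half
        int newLength = Utf8Utils.truncateBytesTo(maxLength, data, 0, data.length); // 6 bytes, all 3 chars fit
        System.out.println(new String(data, 0, oldLength, StandardCharsets.UTF_8)); // "α" + U+FFFD
        System.out.println(new String(data, 0, newLength, StandardCharsets.UTF_8)); // "αβγ"
      }
    }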
+ */ +package org.apache.orc; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(Parameterized.class) +public class TestUnicode { + Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + + File.separator + "tmp")); + + Configuration conf; + FileSystem fs; + Path testFilePath; + + private final String type; + private final int maxLength; + private final boolean hasRTrim; + + @Parameters + public static Collection<Object[]> data() { + ArrayList<Object[]> data = new ArrayList<>(); + for (int j = 0; j < 2; j++) { + for (int i = 1; i <= 5; i++) { + data.add(new Object[] { j == 0 ? "char" : "varchar", i, true }); + } + } + //data.add(new Object[] {"char", 3}); + return data; + } + + public TestUnicode(String type, int maxLength, boolean hasRTrim) { + this.type = type; + this.maxLength = maxLength; + this.hasRTrim = hasRTrim; + } + + static final String[] utf8strs = new String[] { + // Character.UnicodeBlock GREEK (2 bytes) + "\u03b1\u03b2\u03b3", "\u03b1\u03b2", "\u03b1\u03b2\u03b3\u03b4", + "\u03b1\u03b2\u03b3\u03b4", + // Character.UnicodeBlock MALAYALAM (3 bytes) + "\u0d06\u0d30\u0d3e", "\u0d0e\u0d28\u0d4d\u0d24\u0d3e", "\u0d13\u0d7c\u0d15\u0d4d", + // Unicode emoji (4 bytes) + "\u270f\ufe0f\ud83d\udcdd\u270f\ufe0f", "\ud83c\udf3b\ud83d\udc1d\ud83c\udf6f", + "\ud83c\udf7a\ud83e\udd43\ud83c\udf77" }; + + @Rule + public TestName testCaseName = new TestName(); + + @Before + public void openFileSystem() throws Exception { + conf = new Configuration(); + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc"); + fs.delete(testFilePath, false); + } + + @Test + public void testUtf8() throws Exception { + if (type == "varchar") { + testVarChar(maxLength); + } else { + testChar(maxLength); + } + } + + // copied from HiveBaseChar + public static String enforceMaxLength(String val, int maxLength) { + if (val == null) { + return null; + } + String value = val; + + if (maxLength > 0) { + int valLength = val.codePointCount(0, val.length()); + if (valLength > maxLength) { + // Truncate the excess chars to fit the character length. + // Also make sure we take supplementary chars into account. + value = val.substring(0, val.offsetByCodePoints(0, maxLength)); + } + } + return value; + } + + // copied from HiveBaseChar + public static String getPaddedValue(String val, int maxLength, boolean rtrim) { + if (val == null) { + return null; + } + if (maxLength < 0) { + return val; + } + + int valLength = val.codePointCount(0, val.length()); + if (valLength > maxLength) { + return enforceMaxLength(val, maxLength); + } + + if (maxLength > valLength && rtrim == false) { + // Make sure we pad the right amount of spaces; valLength is in terms of code points, + // while StringUtils.rpad() is based on the number of java chars. 
+      int padLength = val.length() + (maxLength - valLength);
+      val = StringUtils.rightPad(val, padLength);
+    }
+    return val;
+  }
+
+  public void testChar(int maxLength) throws Exception {
+    // char(n)
+    TypeDescription schema = TypeDescription.createChar().withMaxLength(maxLength);
+    String[] expected = new String[utf8strs.length];
+    for (int i = 0; i < utf8strs.length; i++) {
+      expected[i] = getPaddedValue(utf8strs[i], maxLength, hasRTrim);
+    }
+    verifyWrittenStrings(schema, utf8strs, expected);
+  }
+
+  public void testVarChar(int maxLength) throws Exception {
+    // varchar(n)
+    TypeDescription schema = TypeDescription.createVarchar().withMaxLength(maxLength);
+    String[] expected = new String[utf8strs.length];
+    for (int i = 0; i < utf8strs.length; i++) {
+      expected[i] = enforceMaxLength(utf8strs[i], maxLength);
+    }
+    verifyWrittenStrings(schema, utf8strs, expected);
+  }
+
+  public void verifyWrittenStrings(TypeDescription schema, String[] inputs, String[] expected)
+      throws Exception {
+    Writer writer =
+        OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)
+            .compress(CompressionKind.NONE).bufferSize(10000));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+    for (int i = 0; i < inputs.length; i++) {
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+      col.setVal(batch.size++, inputs[i].getBytes("UTF-8"));
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader =
+        OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+    RecordReader rows = reader.rows();
+    batch = reader.getSchema().createRowBatch();
+    col = (BytesColumnVector) batch.cols[0];
+    int idx = 0;
+    while (rows.nextBatch(batch)) {
+      for (int r = 0; r < batch.size; ++r) {
+        assertEquals(String.format("test for %s:%d", schema, maxLength), expected[idx],
+            col.toString(r));
+        idx++;
+      }
+    }
+    fs.delete(testFilePath, false);
+  }
+}
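A note on the code-point handling in the test helpers above, with an illustrative snippet (not part of the commit): String.length() counts UTF-16 units, so a 4-byte emoji like the ones in utf8strs is two Java chars but a single code point. That is why getPaddedValue() computes the pad width from code points before calling StringUtils.rightPad(), which counts Java chars.

    public class CodePointDemo {
      public static void main(String[] args) {
        String beer = "\ud83c\udf7a"; // U+1F37A, a single emoji
        System.out.println(beer.length());                          // 2 (UTF-16 units)
        System.out.println(beer.codePointCount(0, beer.length()));  // 1 (characters)
      }
    }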