Repository: orc Updated Branches: refs/heads/master f47e02cfb -> 5ce07a149
ORC-203: Trim StringStatistics to a maximum size of 1024 bytes. Fixes #299 Signed-off-by: Owen O'Malley <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/5ce07a14 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/5ce07a14 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/5ce07a14 Branch: refs/heads/master Commit: 5ce07a14947bac30c8b0abd3a78dd7f7412c792c Parents: f47e02c Author: Sandeep More <[email protected]> Authored: Wed Jul 18 09:31:02 2018 -0400 Committer: Owen O'Malley <[email protected]> Committed: Thu Sep 13 09:27:25 2018 -0400 ---------------------------------------------------------------------- java/core/src/java/org/apache/orc/OrcFile.java | 3 +- .../org/apache/orc/StringColumnStatistics.java | 16 ++ .../apache/orc/impl/ColumnStatisticsImpl.java | 260 ++++++++++++++++-- .../org/apache/orc/TestColumnStatistics.java | 270 +++++++++++++++++-- .../resources/orc-file-dump-bloomfilter.out | 2 +- .../resources/orc-file-dump-bloomfilter2.out | 2 +- .../orc-file-dump-dictionary-threshold.out | 2 +- .../tools/src/test/resources/orc-file-dump.json | 2 +- java/tools/src/test/resources/orc-file-dump.out | 2 +- .../src/test/resources/orc-file-has-null.out | 2 +- proto/orc_proto.proto | 4 + 11 files changed, 513 insertions(+), 52 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/core/src/java/org/apache/orc/OrcFile.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java index 33aa431..5a25725 100644 --- a/java/core/src/java/org/apache/orc/OrcFile.java +++ b/java/core/src/java/org/apache/orc/OrcFile.java @@ -171,6 +171,7 @@ public class OrcFile { HIVE_13083(WriterImplementation.ORC_JAVA, 4), // decimals write present stream correctly ORC_101(WriterImplementation.ORC_JAVA, 5), // bloom filters use utf8 ORC_135(WriterImplementation.ORC_JAVA, 6), // timestamp stats use utc + ORC_203(WriterImplementation.ORC_JAVA, 7), // trim long strings & record they were trimmed // C++ ORC Writer ORC_CPP_ORIGINAL(WriterImplementation.ORC_CPP, 6), @@ -254,7 +255,7 @@ public class OrcFile { /** * The WriterVersion for this version of the software. */ - public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_135; + public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_203; public enum EncodingStrategy { SPEED, COMPRESSION http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/core/src/java/org/apache/orc/StringColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/StringColumnStatistics.java b/java/core/src/java/org/apache/orc/StringColumnStatistics.java index 936b100..9ecbdc9 100644 --- a/java/core/src/java/org/apache/orc/StringColumnStatistics.java +++ b/java/core/src/java/org/apache/orc/StringColumnStatistics.java @@ -34,6 +34,22 @@ public interface StringColumnStatistics extends ColumnStatistics { String getMaximum(); /** + * Get the lower bound of the values in this column. + * The value may be truncated to at most + * MAX_BYTES_RECORDED. + * @return lower bound + */ + String getLowerBound(); + + /** + * Get the upper bound of the values in this column. + * The value may be truncated to at most + * MAX_BYTES_RECORDED. + * @return upper bound + */ + String getUpperBound(); + + /** * Get the total length of all strings * @return the sum (total length) */ http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index be05d80..1b8c801 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -17,13 +17,9 @@ */ package org.apache.orc.impl; -import java.sql.Date; -import java.sql.Timestamp; -import java.util.TimeZone; - -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparator; @@ -39,6 +35,15 @@ import org.apache.orc.StringColumnStatistics; import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.Arrays; +import java.util.TimeZone; + public class ColumnStatisticsImpl implements ColumnStatistics { @Override @@ -517,10 +522,14 @@ public class ColumnStatisticsImpl implements ColumnStatistics { protected static final class StringStatisticsImpl extends ColumnStatisticsImpl implements StringColumnStatistics { + public static final int MAX_BYTES_RECORDED = 1024; private Text minimum = null; private Text maximum = null; private long sum = 0; + private boolean isLowerBoundSet = false; + private boolean isUpperBoundSet = false; + StringStatisticsImpl() { } @@ -543,35 +552,51 @@ public class ColumnStatisticsImpl implements ColumnStatistics { super.reset(); minimum = null; maximum = null; + isLowerBoundSet = false; + isUpperBoundSet = false; sum = 0; } @Override public void updateString(Text value) { - if (minimum == null) { - maximum = minimum = new Text(value); - } else if (minimum.compareTo(value) > 0) { - minimum = new Text(value); - } else if (maximum.compareTo(value) < 0) { - maximum = new Text(value); - } - sum += value.getLength(); + updateString(value.getBytes(), 0, value.getLength(), 1); } @Override public void updateString(byte[] bytes, int offset, int length, int repetitions) { if (minimum == null) { - maximum = minimum = new Text(); - maximum.set(bytes, offset, length); + if(length > MAX_BYTES_RECORDED) { + minimum = truncateLowerBound(bytes, offset); + maximum = truncateUpperBound(bytes, offset); + isLowerBoundSet = true; + isUpperBoundSet = true; + } else { + maximum = minimum = new Text(); + maximum.set(bytes, offset, length); + isLowerBoundSet = false; + isUpperBoundSet = false; + } } else if (WritableComparator.compareBytes(minimum.getBytes(), 0, minimum.getLength(), bytes, offset, length) > 0) { - minimum = new Text(); - minimum.set(bytes, offset, length); + if(length > MAX_BYTES_RECORDED) { + minimum = truncateLowerBound(bytes, offset); + isLowerBoundSet = true; + } else { + minimum = new Text(); + minimum.set(bytes, offset, length); + isLowerBoundSet = false; + } } else if (WritableComparator.compareBytes(maximum.getBytes(), 0, maximum.getLength(), bytes, offset, length) < 0) { - maximum = new Text(); - maximum.set(bytes, offset, length); + if(length > MAX_BYTES_RECORDED) { + maximum = truncateUpperBound(bytes, offset); + isUpperBoundSet = true; + } else { + maximum = new Text(); + maximum.set(bytes, offset, length); + isUpperBoundSet = false; + } } sum += (long)length * repetitions; } @@ -584,16 +609,40 @@ public class ColumnStatisticsImpl implements ColumnStatistics { if (str.minimum != null) { maximum = new Text(str.getMaximum()); minimum = new Text(str.getMinimum()); - } else { + } + /* str.minimum == null when lower bound set */ + else if (str.isLowerBoundSet) { + minimum = new Text(str.getLowerBound()); + isLowerBoundSet = str.isLowerBoundSet; + + /* check for upper bound before setting max */ + if (str.isUpperBoundSet) { + maximum = new Text(str.getUpperBound()); + isUpperBoundSet = str.isUpperBoundSet; + } else { + maximum = new Text(str.getMaximum()); + } + } + else { /* both are empty */ maximum = minimum = null; } } else if (str.minimum != null) { if (minimum.compareTo(str.minimum) > 0) { - minimum = new Text(str.getMinimum()); + if(str.isLowerBoundSet) { + minimum = new Text(str.getLowerBound()); + isLowerBoundSet = str.isLowerBoundSet; + } else { + minimum = new Text(str.getMinimum()); + } } if (maximum.compareTo(str.maximum) < 0) { - maximum = new Text(str.getMaximum()); + if(str.isUpperBoundSet) { + maximum = new Text(str.getUpperBound()); + isUpperBoundSet = str.isUpperBoundSet; + }else { + maximum = new Text(str.getMaximum()); + } } } sum += str.sum; @@ -621,11 +670,45 @@ public class ColumnStatisticsImpl implements ColumnStatistics { @Override public String getMinimum() { - return minimum == null ? null : minimum.toString(); + /* if we have lower bound set (in case of truncation) + getMinimum will be null */ + if(isLowerBoundSet) { + return null; + } else { + return minimum == null ? null : minimum.toString(); + } } @Override public String getMaximum() { + /* if we have upper bound is set (in case of truncation) + getMaximum will be null */ + if(isUpperBoundSet) { + return null; + } else { + return maximum == null ? null : maximum.toString(); + } + } + + /** + * Get the string with + * length = Min(StringStatisticsImpl.MAX_BYTES_RECORDED, getMinimum()) + * + * @return lower bound + */ + @Override + public String getLowerBound() { + return minimum == null ? null : minimum.toString(); + } + + /** + * Get the string with + * length = Min(StringStatisticsImpl.MAX_BYTES_RECORDED, getMaximum()) + * + * @return upper bound + */ + @Override + public String getUpperBound() { return maximum == null ? null : maximum.toString(); } @@ -637,11 +720,19 @@ public class ColumnStatisticsImpl implements ColumnStatistics { @Override public String toString() { StringBuilder buf = new StringBuilder(super.toString()); - if (getNumberOfValues() != 0) { - buf.append(" min: "); - buf.append(getMinimum()); - buf.append(" max: "); - buf.append(getMaximum()); + if (minimum != null) { + if (isLowerBoundSet) { + buf.append(" lower: "); + } else { + buf.append(" min: "); + } + buf.append(getLowerBound()); + if (isUpperBoundSet) { + buf.append(" upper: "); + } else { + buf.append(" max: "); + } + buf.append(getUpperBound()); buf.append(" sum: "); buf.append(sum); } @@ -683,6 +774,119 @@ public class ColumnStatisticsImpl implements ColumnStatistics { result = 31 * result + (int) (sum ^ (sum >>> 32)); return result; } + + /** + * Find the start of the last character that ends in the current string. + * @param text the bytes of the utf-8 + * @param from the first byte location + * @param until the last byte location + * @return the index of the last character + */ + private static int findLastCharacter(byte[] text, int from, int until) { + int posn = until; + /* we don't expect characters more than 5 bytes */ + while (posn >= from) { + if (getCharLength(text[posn]) > 0) { + return posn; + } + posn -= 1; + } + /* beginning of a valid char not found */ + throw new IllegalArgumentException( + "Could not truncate string, beginning of a valid char not found"); + } + + private static int getCodePoint(byte[] source, int from, int len) { + return new String(source, from, len, StandardCharsets.UTF_8) + .codePointAt(0); + } + + private static void appendCodePoint(Text result, int codepoint) { + if (codepoint < 0 || codepoint > 0x1f_ffff) { + throw new IllegalArgumentException("Codepoint out of range " + + codepoint); + } + byte[] buffer = new byte[4]; + if (codepoint < 0x7f) { + buffer[0] = (byte) codepoint; + result.append(buffer, 0, 1); + } else if (codepoint <= 0x7ff) { + buffer[0] = (byte) (0xc0 | (codepoint >> 6)); + buffer[1] = (byte) (0x80 | (codepoint & 0x3f)); + result.append(buffer, 0 , 2); + } else if (codepoint < 0xffff) { + buffer[0] = (byte) (0xe0 | (codepoint >> 12)); + buffer[1] = (byte) (0x80 | ((codepoint >> 6) & 0x3f)); + buffer[2] = (byte) (0x80 | (codepoint & 0x3f)); + result.append(buffer, 0, 3); + } else { + buffer[0] = (byte) (0xf0 | (codepoint >> 18)); + buffer[1] = (byte) (0x80 | ((codepoint >> 12) & 0x3f)); + buffer[2] = (byte) (0x80 | ((codepoint >> 6) & 0x3f)); + buffer[3] = (byte) (0x80 | (codepoint & 0x3f)); + result.append(buffer, 0, 4); + } + } + + /** + * Create a text that is truncated to at most MAX_BYTES_RECORDED at a + * character boundary with the last code point incremented by 1. + * The length is assumed to be greater than MAX_BYTES_RECORDED. + * @param text the text to truncate + * @param from the index of the first character + * @return truncated Text value + */ + private static Text truncateUpperBound(final byte[] text, final int from) { + int followingChar = findLastCharacter(text, from, + from + MAX_BYTES_RECORDED); + int lastChar = findLastCharacter(text, from, followingChar - 1); + Text result = new Text(); + result.set(text, from, lastChar - from); + appendCodePoint(result, + getCodePoint(text, lastChar, followingChar - lastChar) + 1); + return result; + } + + /** + * Create a text that is truncated to at most MAX_BYTES_RECORDED at a + * character boundary. + * The length is assumed to be greater than MAX_BYTES_RECORDED. + * @param text Byte array to truncate + * @param from This is the index of the first character + * @return truncated {@link Text} + */ + private static Text truncateLowerBound(final byte[] text, final int from) { + + int lastChar = findLastCharacter(text, from, from + MAX_BYTES_RECORDED); + Text result = new Text(); + result.set(text, from, lastChar - from); + return result; + } + + /** + * A helper function that returns the length of the UTF-8 character + * IF the given byte is beginning of a valid char. + * In case it is a beginning byte, a value greater than 0 + * is returned (length of character in bytes). + * Else 0 is returned + * @param b + * @return 0 if not beginning of char else length of char in bytes + */ + private static int getCharLength(byte b) { + int len = 0; + if((b & 0b10000000) == 0b00000000 ) { + len = 1; + } else if ((b & 0b11100000) == 0b11000000 ) { + len = 2; + } else if ((b & 0b11110000) == 0b11100000 ) { + len = 3; + } else if ((b & 0b11111000) == 0b11110000 ) { + len = 4; + } else if ((b & 0b11111100) == 0b11111000 ) { + len = 5; + } + return len; + } } protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/core/src/test/org/apache/orc/TestColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index 2045004..30e310c 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -18,27 +18,13 @@ package org.apache.orc; -import static junit.framework.Assert.assertEquals; -import static org.junit.Assume.assumeTrue; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.sql.Timestamp; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.List; -import java.util.TimeZone; - +import org.apache.commons.lang.RandomStringUtils; +import org.apache.commons.lang.StringEscapeUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.Text; import org.apache.orc.impl.ColumnStatisticsImpl; import org.junit.Before; @@ -46,6 +32,17 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.sql.Timestamp; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.TimeZone; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; + /** * Test ColumnStatisticsImpl for ORC. */ @@ -122,6 +119,245 @@ public class TestColumnStatistics { } @Test + public void testUpperAndLowerBounds() throws Exception { + final TypeDescription schema = TypeDescription.createString(); + + final String test = RandomStringUtils.random(1024+10); + final String fragment = "foo"+test; + final String fragmentLowerBound = "bar"+test; + + + final ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema); + final ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema); + + /* test a scenario for the first max string */ + stats1.updateString(new Text(test)); + + final StringColumnStatistics typed = (StringColumnStatistics) stats1; + final StringColumnStatistics typed2 = (StringColumnStatistics) stats2; + + assertTrue("Upperbound cannot be more than 1024 bytes",1024 >= typed.getUpperBound().getBytes().length); + assertTrue("Lowerbound cannot be more than 1024 bytes",1024 >= typed.getLowerBound().getBytes().length); + + assertEquals(null, typed.getMinimum()); + assertEquals(null, typed.getMaximum()); + + stats1.reset(); + + /* test a scenario for the first max bytes */ + stats1.updateString(test.getBytes(), 0, test.getBytes().length, 0); + + assertTrue("Lowerbound cannot be more than 1024 bytes", 1024 >= typed.getLowerBound().getBytes().length); + assertTrue("Upperbound cannot be more than 1024 bytes", 1024 >= typed.getUpperBound().getBytes().length); + + assertEquals(null, typed.getMinimum()); + assertEquals(null, typed.getMaximum()); + + stats1.reset(); + /* test upper bound - merging */ + stats1.updateString(new Text("bob")); + stats1.updateString(new Text("david")); + stats1.updateString(new Text("charles")); + + stats2.updateString(new Text("anne")); + stats2.updateString(new Text(fragment)); + + assertEquals("anne", typed2.getMinimum()); + assertEquals(null, typed2.getMaximum()); + + stats1.merge(stats2); + + assertEquals("anne", typed.getMinimum()); + assertEquals(null, typed.getMaximum()); + + + /* test lower bound - merging */ + stats1.reset(); + stats2.reset(); + + stats1.updateString(new Text("david")); + stats1.updateString(new Text("charles")); + + stats2.updateString(new Text("jane")); + stats2.updateString(new Text(fragmentLowerBound)); + + stats1.merge(stats2); + + assertEquals(null, typed.getMinimum()); + assertEquals("jane", typed.getMaximum()); + } + + /** + * Test the string truncation with 1 byte characters. The last character + * of the truncated string is 0x7f so that it will expand into a 2 byte + * utf-8 character. + */ + @Test + public void testBoundsAscii() { + StringBuilder buffer = new StringBuilder(); + for(int i=0; i < 256; ++i) { + buffer.append("Owe\u007fn"); + } + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create( + TypeDescription.createString()); + stats.increment(); + stats.updateString(new Text(buffer.toString())); + StringColumnStatistics stringStats = (StringColumnStatistics) stats; + + // make sure that the min/max are null + assertEquals(null, stringStats.getMinimum()); + assertEquals(null, stringStats.getMaximum()); + assertEquals(5 * 256, stringStats.getSum()); + + // and that the lower and upper bound are correct + assertEquals(buffer.substring(0, 1024), stringStats.getLowerBound()); + assertEquals("Owe\u0080", stringStats.getUpperBound().substring(1020)); + assertEquals("count: 1 hasNull: false lower: " + stringStats.getLowerBound() + + " upper: " + stringStats.getUpperBound() + " sum: 1280", + stringStats.toString()); + + // make sure that when we replace the min & max the flags get cleared. + stats.increment(); + stats.updateString(new Text("xxx")); + assertEquals("xxx", stringStats.getMaximum()); + assertEquals("xxx", stringStats.getUpperBound()); + stats.increment(); + stats.updateString(new Text("A")); + assertEquals("A", stringStats.getMinimum()); + assertEquals("A", stringStats.getLowerBound()); + assertEquals("count: 3 hasNull: false min: A max: xxx sum: 1284", + stats.toString()); + } + + /** + * Test truncation with 2 byte utf-8 characters. + */ + @Test + public void testBoundsTwoByte() { + StringBuilder buffer = new StringBuilder(); + final String PATTERN = "\u0080\u07ff\u0432\u0246\u0123"; + for(int i=0; i < 256; ++i) { + buffer.append(PATTERN); + } + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create( + TypeDescription.createString()); + stats.increment(); + stats.updateString(new Text(buffer.toString())); + StringColumnStatistics stringStats = (StringColumnStatistics) stats; + + // make sure that the min/max are null + assertEquals(null, stringStats.getMinimum()); + assertEquals(null, stringStats.getMaximum()); + assertEquals(2 * 5 * 256, stringStats.getSum()); + + // and that the lower and upper bound are correct + // 512 two byte characters fit in 1024 bytes + assertEquals(buffer.substring(0, 512), stringStats.getLowerBound()); + assertEquals(buffer.substring(0, 511), + stringStats.getUpperBound().substring(0, 511)); + assertEquals("\u0800", stringStats.getUpperBound().substring(511)); + } + + /** + * Test truncation with 3 byte utf-8 characters. + */ + @Test + public void testBoundsThreeByte() { + StringBuilder buffer = new StringBuilder(); + final String PATTERN = "\uffff\u0800\u4321\u1234\u3137"; + for(int i=0; i < 256; ++i) { + buffer.append(PATTERN); + } + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create( + TypeDescription.createString()); + stats.increment(); + stats.updateString(new Text(buffer.toString())); + StringColumnStatistics stringStats = (StringColumnStatistics) stats; + + // make sure that the min/max are null + assertEquals(null, stringStats.getMinimum()); + assertEquals(null, stringStats.getMaximum()); + assertEquals(3 * 5 * 256, stringStats.getSum()); + + // and that the lower and upper bound are correct + // 341 three byte characters fit in 1024 bytes + assertEquals(buffer.substring(0, 341), stringStats.getLowerBound()); + assertEquals(buffer.substring(0, 340), + stringStats.getUpperBound().substring(0,340)); + assertEquals("\ud800\udc00", stringStats.getUpperBound().substring(340)); + } + + /** + * Test truncation with 4 byte utf-8 characters. + */ + @Test + public void testBoundsFourByte() { + StringBuilder buffer = new StringBuilder(); + final String PATTERN = "\ud800\udc00\ud801\udc01\ud802\udc02\ud803\udc03\ud804\udc04"; + for(int i=0; i < 256; ++i) { + buffer.append(PATTERN); + } + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create( + TypeDescription.createString()); + stats.increment(); + stats.updateString(new Text(buffer.toString())); + StringColumnStatistics stringStats = (StringColumnStatistics) stats; + + // make sure that the min/max are null + assertEquals(null, stringStats.getMinimum()); + assertEquals(null, stringStats.getMaximum()); + assertEquals(4 * 5 * 256, stringStats.getSum()); + + // and that the lower and upper bound are correct + // 256 four byte characters fit in 1024 bytes + assertEquals(buffer.substring(0, 512), stringStats.getLowerBound()); + assertEquals(buffer.substring(0, 510), + stringStats.getUpperBound().substring(0, 510)); + assertEquals("\\uD800\\uDC01", + StringEscapeUtils.escapeJava(stringStats.getUpperBound().substring(510))); + } + + @Test + public void testUpperBoundCodepointIncrement() { + /* test with characters that use more than one byte */ + final String fragment = "è¼è¨å¿åç°æ¢è¾æçºä½µé岩ãå¤ç¾æ±çæ²æ§æä¹æå æ¸ç´¢ã" + + "坿件æç¨å°äº¤æç¸ä¿®å®®ç±æ¹ä¾¡è¦ãä½å£ä¾å¹¾æ¥æ¬æ±ç¥éæ©ææ±åå·åä¸ç¯å¤ç¬¬åã" + + "ç®¡ä»æå³ç³è·å¸¸æ®æµ·å¶æè¦§æè³æãé£å å¤éµå¹´å¤ªé¡ä¼å¨é¢å¸å®³æç£ã" + + "å åè¼å½åçé æ«è¦å¾©æ¥è»å¿ æãå åç颿ªæ³ä¼ä¼å£ä¸çå¹ç¹å¸³å¹ çºé½è©±éã" + + "è¦ªç¦ææ åéæ³¨èªæå³¶æç´éåæ´¾ä¼éãå¹çµé¿åéé½ç´¹ç¥ç¦è¿½åæ¥ã" + + "æ ¹æ¡åè©±å°æ ¼æ²»ä½ç¸æ©éå¸å¤éä½ã話第åå¹³å½éè² äº¬è¤æ²æ¸å¤çã" + + "å年群辺軽妻æ¢åçæ¨©æçè¦è³ªå¨ç ´å¿ã" + + "नà¥à¤à¥ मà¥à¤à¥à¤¤ बिनà¥à¤¦à¥à¤ समसà¥à¤¯à¤¾à¤ à¤à¤à¤¤à¤°à¤à¤¾à¤°à¥à¤¯à¤à¥à¤·à¤®à¤¤à¤¾ सà¥à¤¨à¤¾ पà¥à¤°à¤¤à¤¿ सà¤à¥à¤à¥à¤ यायà¥à¤à¤¾ दिनाà¤à¤ वातावरण "; + + final String input = fragment + + "मà¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ " + + "लà¤à¤¤à¥ नवà¤à¤¬à¤° पà¥à¤°à¤®à¤¾à¤¨ à¤à¤¯à¥à¤à¤¯à¤¾ समसà¥à¤¯à¤¾à¤ विशà¥à¤µ लियॠसमà¤à¤¤à¥ à¤à¤ªà¤à¥ à¤à¤à¤¤à¥à¤°à¤¿à¤¤ विà¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¤ सà¥à¤µà¤¤à¤à¤¤à¥à¤° " + + "वà¥à¤¯à¤¾à¤à¥à¤¯à¤¾à¤¨ à¤à¥à¤¦à¤¨à¤à¥à¤·à¤®à¤¤à¤¾ शà¥à¤à¥à¤° हà¥à¤à¤° मà¥à¤à¤¯ à¤à¤°à¤¤à¤¾à¥¤ दरà¥à¤¶à¤¾à¤¤à¤¾ वातावरण विसà¥à¤¤à¤°à¤£à¤à¥à¤·à¤®à¤¤à¤¾ दà¥à¤·à¤¸à¤à¥ पà¥à¤°à¤¾à¤ªà¥à¤¤ समाà¤à¥ " + + "।ठतà¤à¤¨à¥à¤à¥ दरà¥à¤¶à¤¾à¤¤à¤¾ à¤à¤¾à¤°à¥à¤¯à¤à¤°à¥à¤¤à¤¾ बाधा à¤à¤·à¤§à¤¿à¤ समसà¥à¤¯à¤¾à¤ समसà¥à¤¯à¤¾à¤ à¤à¥à¤ªà¤¨à¥à¤¯à¤¤à¤¾ पà¥à¤°à¤¾à¤£ पसà¤à¤¦ " + + "à¤à¥à¤¯à¤¹ नवà¤à¤¬à¤° दà¥à¤·à¤¸à¤à¥ ठनà¥à¤µà¤¾à¤¦à¤ सà¥à¤«à¤¼à¤¤à¤µà¥à¤° समसà¥à¤¯à¤¾à¤ à¤à¥à¤·à¤®à¤¤à¤¾à¥¤ à¤à¤¾à¤°à¥à¤¯ हà¥à¤à¤°\n"; + + final String lowerBound = fragment + + "मà¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ लà¤à¤¤à¥ नवà¤à¤¬à¤° पà¥à¤°à¤®à¤¾à¤¨ à¤à¤¯à¥à¤à¤¯à¤¾ समसà¥à¤¯à¤¾à¤ विशà¥à¤µ लियॠ"; + + final String upperbound = fragment + + "मà¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ लà¤à¤¤à¥ नवà¤à¤¬à¤° पà¥à¤°à¤®à¤¾à¤¨ à¤à¤¯à¥à¤à¤¯à¤¾ समसà¥à¤¯à¤¾à¤ विशà¥à¤µ लियà¥!"; + + final TypeDescription schema = TypeDescription.createString(); + final ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema); + byte[] utf8 = input.getBytes(StandardCharsets.UTF_8); + stats1.updateString(utf8, 0, utf8.length, 1); + + final StringColumnStatistics typed = (StringColumnStatistics) stats1; + + assertEquals(354, typed.getUpperBound().length()); + assertEquals(354, typed.getLowerBound().length()); + + assertEquals(upperbound, typed.getUpperBound()); + assertEquals(lowerBound, typed.getLowerBound()); + } + + + @Test public void testDateMerge() throws Exception { TypeDescription schema = TypeDescription.createDate(); http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump-bloomfilter.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out index 2a20a71..da79120 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_135 +File Version: 0.12 with ORC_203 Rows: 21000 Compression: ZLIB Compression size: 4096 http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out index c4fa8bf..4ec83bf 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_135 +File Version: 0.12 with ORC_203 Rows: 21000 Compression: ZLIB Compression size: 4096 http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out index 9b9dbef..14e9ac3 100644 --- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out +++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_135 +File Version: 0.12 with ORC_203 Rows: 21000 Compression: ZLIB Compression size: 4096 http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump.json ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json index 72476dd..91c1a2b 100644 --- a/java/tools/src/test/resources/orc-file-dump.json +++ b/java/tools/src/test/resources/orc-file-dump.json @@ -1,7 +1,7 @@ { "fileName": "TestFileDump.testDump.orc", "fileVersion": "0.12", - "writerVersion": "ORC_135", + "writerVersion": "ORC_203", "numberOfRows": 21000, "compression": "ZLIB", "compressionBufferSize": 4096, http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump.out b/java/tools/src/test/resources/orc-file-dump.out index 2ae99ce..d988155 100644 --- a/java/tools/src/test/resources/orc-file-dump.out +++ b/java/tools/src/test/resources/orc-file-dump.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_135 +File Version: 0.12 with ORC_203 Rows: 21000 Compression: ZLIB Compression size: 4096 http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-has-null.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-has-null.out b/java/tools/src/test/resources/orc-file-has-null.out index ed963dd..4fb7d69 100644 --- a/java/tools/src/test/resources/orc-file-has-null.out +++ b/java/tools/src/test/resources/orc-file-has-null.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_135 +File Version: 0.12 with ORC_203 Rows: 20000 Compression: ZLIB Compression size: 4096 http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/proto/orc_proto.proto ---------------------------------------------------------------------- diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index f92e531..e54427d 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -39,6 +39,10 @@ message StringStatistics { optional string maximum = 2; // sum will store the total length of all strings in a stripe optional sint64 sum = 3; + // If the minimum or maximum value was longer than 1024 bytes, store a lower or upper + // bound instead of the minimum or maximum values above. + optional string lowerBound = 4; + optional string upperBound = 5; } message BucketStatistics {
