Repository: orc Updated Branches: refs/heads/master df20c2139 -> fb815ffdb
ORC-203 - Update StringStatistics to trim long strings to 1024 characters & record they were trimmed Fixes #292 Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/cedd0f91 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/cedd0f91 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/cedd0f91 Branch: refs/heads/master Commit: cedd0f913ad4d91348bb99e70b07d83854ca2719 Parents: df20c21 Author: Sandeep More <m...@apache.org> Authored: Wed Jul 18 09:31:02 2018 -0400 Committer: Owen O'Malley <omal...@apache.org> Committed: Mon Aug 6 14:32:48 2018 -0700 ---------------------------------------------------------------------- .../org/apache/orc/StringColumnStatistics.java | 14 + .../apache/orc/impl/ColumnStatisticsImpl.java | 284 +++++++++++++++++-- .../org/apache/orc/TestColumnStatistics.java | 138 +++++++-- proto/orc_proto.proto | 4 + 4 files changed, 405 insertions(+), 35 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/orc/blob/cedd0f91/java/core/src/java/org/apache/orc/StringColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/StringColumnStatistics.java b/java/core/src/java/org/apache/orc/StringColumnStatistics.java index 936b100..93d197a 100644 --- a/java/core/src/java/org/apache/orc/StringColumnStatistics.java +++ b/java/core/src/java/org/apache/orc/StringColumnStatistics.java @@ -34,6 +34,20 @@ public interface StringColumnStatistics extends ColumnStatistics { String getMaximum(); /** + * Get the string with + * length = Min(StringStatisticsImpl.MAX_BYTES_RECORDED, getMinimum()) + * @return lower bound + */ + String getLowerBound(); + + /** + * Get the string with + * length = Min(StringStatisticsImpl.MAX_BYTES_RECORDED, getMaximum()) + * @return upper bound + */ + String getUpperBound(); + + /** * Get the total length of all strings * @return the sum (total length) */ http://git-wip-us.apache.org/repos/asf/orc/blob/cedd0f91/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index be05d80..0514839 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -17,13 +17,9 @@ */ package org.apache.orc.impl; -import java.sql.Date; -import java.sql.Timestamp; -import java.util.TimeZone; - -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparator; @@ -39,6 +35,14 @@ import org.apache.orc.StringColumnStatistics; import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.Arrays; +import java.util.TimeZone; + public class ColumnStatisticsImpl implements ColumnStatistics { @Override @@ -517,10 +521,14 @@ public class ColumnStatisticsImpl implements ColumnStatistics { protected static final class StringStatisticsImpl extends ColumnStatisticsImpl implements StringColumnStatistics { + public static final int MAX_BYTES_RECORDED = 1024; private Text minimum = null; private Text maximum = null; private long sum = 0; + private boolean isLowerBoundSet = false; + private boolean isUpperBoundSet = false; + StringStatisticsImpl() { } @@ -543,35 +551,73 @@ public class ColumnStatisticsImpl implements ColumnStatistics { super.reset(); minimum = null; maximum = null; + isLowerBoundSet = false; + isUpperBoundSet = false; sum = 0; } @Override public void updateString(Text value) { if (minimum == null) { - maximum = minimum = new Text(value); + if(value.getLength() > MAX_BYTES_RECORDED) { + minimum = truncateLowerBound(value); + maximum = truncateUpperBound(value); + isLowerBoundSet = true; + isUpperBoundSet = true; + } else { + maximum = minimum = new Text(value); + } } else if (minimum.compareTo(value) > 0) { - minimum = new Text(value); + if(value.getLength() > MAX_BYTES_RECORDED) { + minimum = truncateLowerBound(value); + isLowerBoundSet = true; + }else { + minimum = new Text(value); + } } else if (maximum.compareTo(value) < 0) { - maximum = new Text(value); + if(value.getLength() > MAX_BYTES_RECORDED) { + maximum = truncateUpperBound(value); + isUpperBoundSet = true; + } else { + maximum = new Text(value); + } } sum += value.getLength(); } + @Override public void updateString(byte[] bytes, int offset, int length, int repetitions) { + byte[] input = Arrays.copyOfRange(bytes, offset, offset+(length)); if (minimum == null) { - maximum = minimum = new Text(); - maximum.set(bytes, offset, length); + if(length > MAX_BYTES_RECORDED) { + minimum = truncateLowerBound(input); + maximum = truncateUpperBound(input); + isLowerBoundSet = true; + isUpperBoundSet = true; + } else { + maximum = minimum = new Text(); + maximum.set(bytes, offset, length); + } } else if (WritableComparator.compareBytes(minimum.getBytes(), 0, minimum.getLength(), bytes, offset, length) > 0) { - minimum = new Text(); - minimum.set(bytes, offset, length); + if(length > MAX_BYTES_RECORDED) { + minimum = truncateLowerBound(input); + isLowerBoundSet = true; + } else { + minimum = new Text(); + minimum.set(bytes, offset, length); + } } else if (WritableComparator.compareBytes(maximum.getBytes(), 0, maximum.getLength(), bytes, offset, length) < 0) { - maximum = new Text(); - maximum.set(bytes, offset, length); + if(length > MAX_BYTES_RECORDED) { + maximum = truncateUpperBound(input); + isUpperBoundSet = true; + } else { + maximum = new Text(); + maximum.set(bytes, offset, length); + } } sum += (long)length * repetitions; } @@ -584,16 +630,40 @@ public class ColumnStatisticsImpl implements ColumnStatistics { if (str.minimum != null) { maximum = new Text(str.getMaximum()); minimum = new Text(str.getMinimum()); - } else { + } + /* str.minimum == null when lower bound set */ + else if (str.isLowerBoundSet) { + minimum = new Text(str.getLowerBound()); + isLowerBoundSet = true; + + /* check for upper bound before setting max */ + if (str.isUpperBoundSet) { + maximum = new Text(str.getUpperBound()); + isUpperBoundSet = true; + } else { + maximum = new Text(str.getMaximum()); + } + } + else { /* both are empty */ maximum = minimum = null; } } else if (str.minimum != null) { if (minimum.compareTo(str.minimum) > 0) { - minimum = new Text(str.getMinimum()); + if(str.isLowerBoundSet) { + minimum = new Text(str.getLowerBound()); + isLowerBoundSet = true; + } else { + minimum = new Text(str.getMinimum()); + } } if (maximum.compareTo(str.maximum) < 0) { - maximum = new Text(str.getMaximum()); + if(str.isUpperBoundSet) { + maximum = new Text(str.getUpperBound()); + isUpperBoundSet = true; + }else { + maximum = new Text(str.getMaximum()); + } } } sum += str.sum; @@ -621,11 +691,45 @@ public class ColumnStatisticsImpl implements ColumnStatistics { @Override public String getMinimum() { - return minimum == null ? null : minimum.toString(); + /* if we have lower bound set (in case of truncation) + getMinimum will be null */ + if(isLowerBoundSet) { + return null; + } else { + return minimum == null ? null : minimum.toString(); + } } @Override public String getMaximum() { + /* if we have upper bound is set (in case of truncation) + getMaximum will be null */ + if(isUpperBoundSet) { + return null; + } else { + return maximum == null ? null : maximum.toString(); + } + } + + /** + * Get the string with + * length = Min(StringStatisticsImpl.MAX_BYTES_RECORDED, getMinimum()) + * + * @return lower bound + */ + @Override + public String getLowerBound() { + return minimum == null ? null : minimum.toString(); + } + + /** + * Get the string with + * length = Min(StringStatisticsImpl.MAX_BYTES_RECORDED, getMaximum()) + * + * @return upper bound + */ + @Override + public String getUpperBound() { return maximum == null ? null : maximum.toString(); } @@ -683,6 +787,150 @@ public class ColumnStatisticsImpl implements ColumnStatistics { result = 31 * result + (int) (sum ^ (sum >>> 32)); return result; } + + /** + * A helper function that truncates the {@link Text} input + * based on {@link #MAX_BYTES_RECORDED} and increments + * the last codepoint by 1. + * @param text + * @return truncated Text value + */ + private static Text truncateUpperBound(final Text text) { + + if(text.getBytes().length > MAX_BYTES_RECORDED) { + return truncateUpperBound(text.getBytes()); + } else { + return text; + } + + } + + /** + * A helper function that truncates the {@link byte[]} input + * based on {@link #MAX_BYTES_RECORDED} and increments + * the last codepoint by 1. + * @param text + * @return truncated Text value + */ + private static Text truncateUpperBound(final byte[] text) { + if(text.length > MAX_BYTES_RECORDED) { + final Text truncated = truncateLowerBound(text); + final byte[] data = truncated.getBytes(); + + int lastCharPosition = data.length - 1; + int offset = 0; + + /* we don't expect characters more than 5 bytes */ + for (int i = 0; i < 5; i++) { + final byte b = data[lastCharPosition]; + offset = getCharLength(b); + + /* found beginning of a valid char */ + if (offset > 0) { + final byte[] lastCharBytes = Arrays + .copyOfRange(text, lastCharPosition, lastCharPosition + offset); + /* last character */ + final String s = new String(lastCharBytes, Charset.forName("UTF-8")); + + /* increment the codepoint of last character */ + int codePoint = s.codePointAt(s.length() - 1); + codePoint++; + final char[] incrementedChars = Character.toChars(codePoint); + + /* convert char array to byte array */ + final CharBuffer charBuffer = CharBuffer.wrap(incrementedChars); + final ByteBuffer byteBuffer = Charset.forName("UTF-8").encode(charBuffer); + final byte[] bytes = Arrays.copyOfRange(byteBuffer.array(), byteBuffer.position(), + byteBuffer.limit()); + + final byte[] result = new byte[lastCharPosition + bytes.length]; + + /* copy truncated array minus last char */ + System.arraycopy(text, 0, result, 0, lastCharPosition); + /* copy last char */ + System.arraycopy(bytes, 0, result, lastCharPosition, bytes.length); + + return new Text(result); + + } /* not found keep looking for a beginning byte */ else { + --lastCharPosition; + } + + } + /* beginning of a valid char not found */ + throw new IllegalArgumentException( + "Could not truncate string, beginning of a valid char not found"); + } else { + return new Text(text); + } + } + + private static Text truncateLowerBound(final Text text) { + if(text.getBytes().length > MAX_BYTES_RECORDED) { + return truncateLowerBound(text.getBytes()); + } else { + return text; + } + } + + + private static Text truncateLowerBound(final byte[] text) { + + if(text.length > MAX_BYTES_RECORDED) { + + int truncateLen = MAX_BYTES_RECORDED; + int offset = 0; + + for(int i=0; i<5; i++) { + + byte b = text[truncateLen]; + /* check for the beginning of 1,2,3,4,5 bytes long char */ + offset = getCharLength(b); + + /* found beginning of a valid char */ + if(offset > 0) { + byte[] truncated = Arrays.copyOfRange(text, 0, (truncateLen)); + return new Text(truncated); + } else { + /* beginning of a valid char not found decrease the + length of array by 1 and loop */ + --truncateLen; + } + + } + /* beginning of a valid char not found */ + throw new IllegalArgumentException("Could not truncate string, beginning of a valid char not found"); + + } else { + return new Text(text); + } + + } + + /** + * A helper function that returns the length of the UTF-8 character + * IF the given byte is beginning of a valid char. + * In case it is a beginning byte, a value greater than 0 + * is returned (length of character in bytes). + * Else 0 is returned + * @param b + * @return 0 if not beginning of char else length of char in bytes + */ + private static int getCharLength(byte b) { + int len = 0; + if((b & 0b10000000) == 0b00000000 ) { + len = 1; + } else if ((b & 0b11100000) == 0b11000000 ) { + len = 2; + } else if ((b & 0b11110000) == 0b11100000 ) { + len = 3; + } else if ((b & 0b11111000) == 0b11110000 ) { + len = 4; + } else if ((b & 0b11111100) == 0b11111000 ) { + len = 5; + } + return len; + } } protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements http://git-wip-us.apache.org/repos/asf/orc/blob/cedd0f91/java/core/src/test/org/apache/orc/TestColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index 2045004..eae53fc 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -18,27 +18,12 @@ package org.apache.orc; -import static junit.framework.Assert.assertEquals; -import static org.junit.Assume.assumeTrue; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.sql.Timestamp; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.List; -import java.util.TimeZone; - +import org.apache.commons.lang.RandomStringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.Text; import org.apache.orc.impl.ColumnStatisticsImpl; import org.junit.Before; @@ -46,6 +31,16 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; +import java.io.File; +import java.sql.Timestamp; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.TimeZone; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; + /** * Test ColumnStatisticsImpl for ORC. */ @@ -122,6 +117,115 @@ public class TestColumnStatistics { } @Test + public void testUpperAndLowerBounds() throws Exception { + final TypeDescription schema = TypeDescription.createString(); + + final String test = RandomStringUtils.random(1024+10); + final String fragment = "foo"+test; + final String fragmentLowerBound = "bar"+test; + + + final ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema); + final ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema); + + /* test a scenario for the first max string */ + stats1.updateString(new Text(test)); + + final StringColumnStatistics typed = (StringColumnStatistics) stats1; + final StringColumnStatistics typed2 = (StringColumnStatistics) stats2; + + assertTrue("Upperbound cannot be more than 1024 bytes",1024 >= typed.getUpperBound().getBytes().length); + assertTrue("Lowerbound cannot be more than 1024 bytes",1024 >= typed.getLowerBound().getBytes().length); + + assertEquals(null, typed.getMinimum()); + assertEquals(null, typed.getMaximum()); + + stats1.reset(); + + /* test a scenario for the first max bytes */ + stats1.updateString(test.getBytes(), 0, test.getBytes().length, 0); + + assertTrue("Lowerbound cannot be more than 1024 bytes", 1024 >= typed.getLowerBound().getBytes().length); + assertTrue("Upperbound cannot be more than 1024 bytes", 1024 >= typed.getUpperBound().getBytes().length); + + assertEquals(null, typed.getMinimum()); + assertEquals(null, typed.getMaximum()); + + stats1.reset(); + /* test upper bound - merging */ + stats1.updateString(new Text("bob")); + stats1.updateString(new Text("david")); + stats1.updateString(new Text("charles")); + + stats2.updateString(new Text("anne")); + stats2.updateString(new Text(fragment)); + + assertEquals("anne", typed2.getMinimum()); + assertEquals(null, typed2.getMaximum()); + + stats1.merge(stats2); + + assertEquals("anne", typed.getMinimum()); + assertEquals(null, typed.getMaximum()); + + + /* test lower bound - merging */ + stats1.reset(); + stats2.reset(); + + stats1.updateString(new Text("david")); + stats1.updateString(new Text("charles")); + + stats2.updateString(new Text("jane")); + stats2.updateString(new Text(fragmentLowerBound)); + + stats1.merge(stats2); + + assertEquals(null, typed.getMinimum()); + assertEquals("jane", typed.getMaximum()); + } + + @Test + public void testUpperBoundCodepointIncrement() { + /* test with characters that use more than one byte */ + final String fragment = "è¼è¨å¿åç°æ¢è¾æçºä½µé岩ãå¤ç¾æ±çæ²æ§æä¹æå æ¸ç´¢ã" + + "坿件æç¨å°äº¤æç¸ä¿®å®®ç±æ¹ä¾¡è¦ãä½å£ä¾å¹¾æ¥æ¬æ±ç¥éæ©ææ±åå·åä¸ç¯å¤ç¬¬åã" + + "ç®¡ä»æå³ç³è·å¸¸æ®æµ·å¶æè¦§æè³æãé£å å¤éµå¹´å¤ªé¡ä¼å¨é¢å¸å®³æç£ã" + + "å åè¼å½åçé æ«è¦å¾©æ¥è»å¿ æãå åç颿ªæ³ä¼ä¼å£ä¸çå¹ç¹å¸³å¹ çºé½è©±éã" + + "è¦ªç¦ææ åéæ³¨èªæå³¶æç´éåæ´¾ä¼éãå¹çµé¿åéé½ç´¹ç¥ç¦è¿½åæ¥ã" + + "æ ¹æ¡åè©±å°æ ¼æ²»ä½ç¸æ©éå¸å¤éä½ã話第åå¹³å½éè² äº¬è¤æ²æ¸å¤çã" + + "å年群辺軽妻æ¢åçæ¨©æçè¦è³ªå¨ç ´å¿ã" + + "नà¥à¤à¥ मà¥à¤à¥à¤¤ बिनà¥à¤¦à¥à¤ समसà¥à¤¯à¤¾à¤ à¤à¤à¤¤à¤°à¤à¤¾à¤°à¥à¤¯à¤à¥à¤·à¤®à¤¤à¤¾ सà¥à¤¨à¤¾ पà¥à¤°à¤¤à¤¿ सà¤à¥à¤à¥à¤ यायà¥à¤à¤¾ दिनाà¤à¤ वातावरण "; + + final String input = fragment + + "मà¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ " + + "लà¤à¤¤à¥ नवà¤à¤¬à¤° पà¥à¤°à¤®à¤¾à¤¨ à¤à¤¯à¥à¤à¤¯à¤¾ समसà¥à¤¯à¤¾à¤ विशà¥à¤µ लियॠसमà¤à¤¤à¥ à¤à¤ªà¤à¥ à¤à¤à¤¤à¥à¤°à¤¿à¤¤ विà¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¤ सà¥à¤µà¤¤à¤à¤¤à¥à¤° " + + "वà¥à¤¯à¤¾à¤à¥à¤¯à¤¾à¤¨ à¤à¥à¤¦à¤¨à¤à¥à¤·à¤®à¤¤à¤¾ शà¥à¤à¥à¤° हà¥à¤à¤° मà¥à¤à¤¯ à¤à¤°à¤¤à¤¾à¥¤ दरà¥à¤¶à¤¾à¤¤à¤¾ वातावरण विसà¥à¤¤à¤°à¤£à¤à¥à¤·à¤®à¤¤à¤¾ दà¥à¤·à¤¸à¤à¥ पà¥à¤°à¤¾à¤ªà¥à¤¤ समाà¤à¥ " + + "।ठतà¤à¤¨à¥à¤à¥ दरà¥à¤¶à¤¾à¤¤à¤¾ à¤à¤¾à¤°à¥à¤¯à¤à¤°à¥à¤¤à¤¾ बाधा à¤à¤·à¤§à¤¿à¤ समसà¥à¤¯à¤¾à¤ समसà¥à¤¯à¤¾à¤ à¤à¥à¤ªà¤¨à¥à¤¯à¤¤à¤¾ पà¥à¤°à¤¾à¤£ पसà¤à¤¦ " + + "à¤à¥à¤¯à¤¹ नवà¤à¤¬à¤° दà¥à¤·à¤¸à¤à¥ ठनà¥à¤µà¤¾à¤¦à¤ सà¥à¤«à¤¼à¤¤à¤µà¥à¤° समसà¥à¤¯à¤¾à¤ à¤à¥à¤·à¤®à¤¤à¤¾à¥¤ à¤à¤¾à¤°à¥à¤¯ हà¥à¤à¤°\n"; + + final String lowerBound = fragment + + "मà¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ लà¤à¤¤à¥ नवà¤à¤¬à¤° पà¥à¤°à¤®à¤¾à¤¨ à¤à¤¯à¥à¤à¤¯à¤¾ समसà¥à¤¯à¤¾à¤ विशà¥à¤µ लियॠ"; + + final String upperbound = fragment + + "मà¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ लà¤à¤¤à¥ नवà¤à¤¬à¤° पà¥à¤°à¤®à¤¾à¤¨ à¤à¤¯à¥à¤à¤¯à¤¾ समसà¥à¤¯à¤¾à¤ विशà¥à¤µ लियà¥!"; + + final TypeDescription schema = TypeDescription.createString(); + final ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema); + + stats1.updateString(input.getBytes(), 0, input.getBytes().length, 1); + + final StringColumnStatistics typed = (StringColumnStatistics) stats1; + + assertEquals(1022, typed.getUpperBound().getBytes().length); + assertEquals(1022, typed.getLowerBound().getBytes().length); + + assertEquals(upperbound, typed.getUpperBound()); + assertEquals(lowerBound, typed.getLowerBound()); + } + + + @Test public void testDateMerge() throws Exception { TypeDescription schema = TypeDescription.createDate(); http://git-wip-us.apache.org/repos/asf/orc/blob/cedd0f91/proto/orc_proto.proto ---------------------------------------------------------------------- diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index f92e531..e54427d 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -39,6 +39,10 @@ message StringStatistics { optional string maximum = 2; // sum will store the total length of all strings in a stripe optional sint64 sum = 3; + // If the minimum or maximum value was longer than 1024 bytes, store a lower or upper + // bound instead of the minimum or maximum values above. + optional string lowerBound = 4; + optional string upperBound = 5; } message BucketStatistics {