Github user omalley commented on a diff in the pull request: https://github.com/apache/orc/pull/299#discussion_r208401986 --- Diff: java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java --- @@ -683,6 +787,150 @@ public int hashCode() { result = 31 * result + (int) (sum ^ (sum >>> 32)); return result; } + + /** + * A helper function that truncates the {@link Text} input + * based on {@link #MAX_BYTES_RECORDED} and increments + * the last codepoint by 1. + * @param text + * @return truncated Text value + */ + private static Text truncateUpperBound(final Text text) { + + if(text.getBytes().length > MAX_BYTES_RECORDED) { + return truncateUpperBound(text.getBytes()); + } else { + return text; + } + + } + + /** + * A helper function that truncates the {@link byte[]} input + * based on {@link #MAX_BYTES_RECORDED} and increments + * the last codepoint by 1. + * @param text + * @return truncated Text value + */ + private static Text truncateUpperBound(final byte[] text) { + if(text.length > MAX_BYTES_RECORDED) { + final Text truncated = truncateLowerBound(text); + final byte[] data = truncated.getBytes(); + + int lastCharPosition = data.length - 1; + int offset = 0; + + /* we don't expect characters more than 5 bytes */ + for (int i = 0; i < 5; i++) { + final byte b = data[lastCharPosition]; + offset = getCharLength(b); + + /* found beginning of a valid char */ + if (offset > 0) { + final byte[] lastCharBytes = Arrays + .copyOfRange(text, lastCharPosition, lastCharPosition + offset); + /* last character */ + final String s = new String(lastCharBytes, Charset.forName("UTF-8")); + + /* increment the codepoint of last character */ + int codePoint = s.codePointAt(s.length() - 1); + codePoint++; + final char[] incrementedChars = Character.toChars(codePoint); + + /* convert char array to byte array */ + final CharBuffer charBuffer = CharBuffer.wrap(incrementedChars); + final ByteBuffer byteBuffer = Charset.forName("UTF-8").encode(charBuffer); + final byte[] bytes = 
Arrays.copyOfRange(byteBuffer.array(), byteBuffer.position(), + byteBuffer.limit()); + + final byte[] result = new byte[lastCharPosition + bytes.length]; + + /* copy truncated array minus last char */ + System.arraycopy(text, 0, result, 0, lastCharPosition); + /* copy last char */ + System.arraycopy(bytes, 0, result, lastCharPosition, bytes.length); + + return new Text(result); + + } /* not found keep looking for a beginning byte */ else { + --lastCharPosition; + } + + } + /* beginning of a valid char not found */ + throw new IllegalArgumentException( + "Could not truncate string, beginning of a valid char not found"); + } else { + return new Text(text); + } + } + + private static Text truncateLowerBound(final Text text) { + if(text.getBytes().length > MAX_BYTES_RECORDED) { + return truncateLowerBound(text.getBytes()); + } else { + return text; + } + } + + + private static Text truncateLowerBound(final byte[] text) { + + if(text.length > MAX_BYTES_RECORDED) { + + int truncateLen = MAX_BYTES_RECORDED; + int offset = 0; + + for(int i=0; i<5; i++) { + + byte b = text[truncateLen]; + /* check for the beginning of 1,2,3,4,5 bytes long char */ + offset = getCharLength(b); + + /* found beginning of a valid char */ + if(offset > 0) { + byte[] truncated = Arrays.copyOfRange(text, 0, (truncateLen)); --- End diff -- If you instead do `Text result = new Text(); result.set(text, 0, truncateLen); return result;`, you'll make one fewer copy of the bytes.
---