Github user omalley commented on a diff in the pull request: https://github.com/apache/orc/pull/299#discussion_r208402722 --- Diff: java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java --- @@ -683,6 +787,150 @@ public int hashCode() { result = 31 * result + (int) (sum ^ (sum >>> 32)); return result; } + + /** + * A helper function that truncates the {@link Text} input + * based on {@link #MAX_BYTES_RECORDED} and increments + * the last codepoint by 1. + * @param text + * @return truncated Text value + */ + private static Text truncateUpperBound(final Text text) { + + if(text.getBytes().length > MAX_BYTES_RECORDED) { + return truncateUpperBound(text.getBytes()); + } else { + return text; + } + + } + + /** + * A helper function that truncates the {@link byte[]} input + * based on {@link #MAX_BYTES_RECORDED} and increments + * the last codepoint by 1. + * @param text + * @return truncated Text value + */ + private static Text truncateUpperBound(final byte[] text) { + if(text.length > MAX_BYTES_RECORDED) { + final Text truncated = truncateLowerBound(text); + final byte[] data = truncated.getBytes(); + + int lastCharPosition = data.length - 1; + int offset = 0; + + /* we don't expect characters more than 5 bytes */ + for (int i = 0; i < 5; i++) { + final byte b = data[lastCharPosition]; + offset = getCharLength(b); + + /* found beginning of a valid char */ + if (offset > 0) { + final byte[] lastCharBytes = Arrays + .copyOfRange(text, lastCharPosition, lastCharPosition + offset); + /* last character */ + final String s = new String(lastCharBytes, Charset.forName("UTF-8")); + + /* increment the codepoint of last character */ + int codePoint = s.codePointAt(s.length() - 1); + codePoint++; + final char[] incrementedChars = Character.toChars(codePoint); + + /* convert char array to byte array */ + final CharBuffer charBuffer = CharBuffer.wrap(incrementedChars); + final ByteBuffer byteBuffer = Charset.forName("UTF-8").encode(charBuffer); + final byte[] bytes = 
Arrays.copyOfRange(byteBuffer.array(), byteBuffer.position(), + byteBuffer.limit()); + + final byte[] result = new byte[lastCharPosition + bytes.length]; + + /* copy truncated array minus last char */ + System.arraycopy(text, 0, result, 0, lastCharPosition); + /* copy last char */ + System.arraycopy(bytes, 0, result, lastCharPosition, bytes.length); + + return new Text(result);
--- End diff --

A better pattern is:

    Text result = new Text();
    result.setCapacity(lastCharPosition + bytes.length);
    result.set(text, 0, lastCharPosition);
    result.append(bytes, 0, bytes.length);
    return result;
---