Github user omalley commented on a diff in the pull request:
https://github.com/apache/orc/pull/299#discussion_r208394170
--- Diff: java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
---
@@ -683,6 +787,150 @@ public int hashCode() {
result = 31 * result + (int) (sum ^ (sum >>> 32));
return result;
}
+
+ /**
+ * Produces the upper-bound form of a {@link Text} value.
+ * <p>
+ * When the array returned by {@code text.getBytes()} has more than
+ * {@link #MAX_BYTES_RECORDED} elements, the work is delegated to
+ * {@link #truncateUpperBound(byte[])}, which truncates the bytes and
+ * increments the last codepoint by 1. Otherwise the input instance is
+ * returned unchanged.
+ * <p>
+ * NOTE(review): the size check looks at the length of the whole array
+ * returned by {@code getBytes()}, and that whole array is what gets handed
+ * to the overload. If that array can be longer than the valid data, the
+ * data length has to be taken into account as well; confirm.
+ *
+ * @param text the value to bound
+ * @return the truncated and incremented value, or {@code text} itself when
+ * the check above does not trigger
+ */
+ private static Text truncateUpperBound(final Text text) {
+
+ if(text.getBytes().length > MAX_BYTES_RECORDED) {
+ return truncateUpperBound(text.getBytes());
+ } else {
+ return text;
+ }
+
+ }
+
+ /**
+ * Produces the upper-bound form of a {@code byte[]} value: truncates it
+ * based on {@link #MAX_BYTES_RECORDED} and increments the last codepoint
+ * by 1.
+ * <p>
+ * Inputs of at most {@link #MAX_BYTES_RECORDED} bytes are wrapped in a new
+ * {@link Text} as is. Longer inputs are first passed to
+ * {@code truncateLowerBound(byte[])}; the last (at most 5) bytes of that
+ * result are then scanned backwards for a byte for which
+ * {@code getCharLength} reports a positive length. The character starting
+ * there is decoded as UTF-8, its codepoint is incremented, and the result
+ * is the bytes before that character followed by the re-encoded one.
+ * <p>
+ * NOTE(review): all positions are derived from array lengths
+ * ({@code text.length} and the length of {@code truncated.getBytes()});
+ * no separate data length is supplied. If either array can be longer than
+ * the valid data, the scan starts at the wrong byte; confirm.
+ *
+ * @param text the bytes to bound
+ * @return truncated Text value
+ * @throws IllegalArgumentException if none of the inspected bytes is
+ * reported as the beginning of a character
+ */
+ private static Text truncateUpperBound(final byte[] text) {
+ if(text.length > MAX_BYTES_RECORDED) {
+ final Text truncated = truncateLowerBound(text);
+ final byte[] data = truncated.getBytes();
+
+ int lastCharPosition = data.length - 1;
+ int offset = 0;
+
+ /* we don't expect characters more than 5 bytes, so at most the
+ * last 5 bytes are inspected while walking backwards */
+ for (int i = 0; i < 5; i++) {
+ final byte b = data[lastCharPosition];
+ offset = getCharLength(b);
+
+ /* found beginning of a valid char (getCharLength gave a positive
+ * length for this byte) */
+ if (offset > 0) {
+ final byte[] lastCharBytes = Arrays
+ .copyOfRange(text, lastCharPosition, lastCharPosition +
offset);
+ /* last character, decoded as UTF-8.
+ * NOTE(review): the position comes from the truncated copy but
+ * the bytes above are read from the original input; presumably
+ * both hold the same bytes in this range -- confirm */
+ final String s = new String(lastCharBytes,
Charset.forName("UTF-8"));
+
+ /* increment the codepoint of last character.
+ * NOTE(review): codePointAt is given the index of the last char
+ * of s, so for a character that decodes to a surrogate pair it
+ * reads the trailing surrogate only; the incremented value is
+ * also not range-checked before Character.toChars -- confirm
+ * that both are acceptable */
+ int codePoint = s.codePointAt(s.length() - 1);
+ codePoint++;
+ final char[] incrementedChars = Character.toChars(codePoint);
+
+ /* convert char array to byte array (UTF-8); only the bytes
+ * between the buffer's position and limit are kept */
+ final CharBuffer charBuffer =
CharBuffer.wrap(incrementedChars);
+ final ByteBuffer byteBuffer =
Charset.forName("UTF-8").encode(charBuffer);
+ final byte[] bytes = Arrays.copyOfRange(byteBuffer.array(),
byteBuffer.position(),
+ byteBuffer.limit());
+
+ final byte[] result = new byte[lastCharPosition +
bytes.length];
+
+ /* copy truncated array minus last char (taken from the original
+ * input, up to the start of the last character) */
+ System.arraycopy(text, 0, result, 0, lastCharPosition);
+ /* copy last char (the incremented, re-encoded one) */
+ System.arraycopy(bytes, 0, result, lastCharPosition,
bytes.length);
+
+ return new Text(result);
+
+ } /* not found keep looking for a beginning byte */ else {
+ --lastCharPosition;
+ }
+
+ }
+ /* beginning of a valid char not found */
+ throw new IllegalArgumentException(
+ "Could not truncate string, beginning of a valid char not
found");
+ } else {
+ return new Text(text);
+ }
+ }
+ /**
+ * Produces the lower-bound form of a {@link Text} value.
+ * <p>
+ * When the array returned by {@code text.getBytes()} has more than
+ * {@link #MAX_BYTES_RECORDED} elements, that array is passed to
+ * {@code truncateLowerBound(byte[])} and its result is returned.
+ * Otherwise the input instance is returned unchanged.
+ * <p>
+ * NOTE(review): as written, the check and the delegation both use the
+ * whole array from {@code getBytes()}; if that array can be longer than
+ * the valid data, the data length needs to be passed along too; confirm.
+ *
+ * @param text the value to bound
+ * @return the truncated value, or {@code text} itself when the check
+ * above does not trigger
+ */
+ private static Text truncateLowerBound(final Text text) {
+ if(text.getBytes().length > MAX_BYTES_RECORDED) {
+ return truncateLowerBound(text.getBytes());
+ } else {
+ return text;
+ }
+ }
+
+
+ private static Text truncateLowerBound(final byte[] text) {
--- End diff --
You need to pass the length in here as well. The byte array may be longer
than the length of the data.
---