[GitHub] orc pull request #299: ORC-203 - Update StringStatistics to trim long string...

omalley Tue, 07 Aug 2018 15:17:22 -0700

Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/299#discussion_r208401986
  
    --- Diff: java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java ---
    @@ -683,6 +787,150 @@ public int hashCode() {
           result = 31 * result + (int) (sum ^ (sum >>> 32));
           return result;
         }
    +
    +    /**
    +     * A helper function that truncates the {@link Text} input
    +     * based on {@link #MAX_BYTES_RECORDED} and increments
    +     * the last codepoint by 1.
    +     * @param text
    +     * @return truncated Text value
    +     */
    +    private static Text truncateUpperBound(final Text text) {
    +
    +      if(text.getBytes().length > MAX_BYTES_RECORDED) {
    +        return truncateUpperBound(text.getBytes());
    +      } else {
    +        return text;
    +      }
    +
    +    }
    +
    +    /**
    +     * A helper function that truncates the {@link byte[]} input
    +     * based on {@link #MAX_BYTES_RECORDED} and increments
    +     * the last codepoint by 1.
    +     * @param text
    +     * @return truncated Text value
    +     */
    +    private static Text truncateUpperBound(final byte[] text) {
    +      if(text.length > MAX_BYTES_RECORDED) {
    +        final Text truncated = truncateLowerBound(text);
    +        final byte[] data = truncated.getBytes();
    +
    +        int lastCharPosition = data.length - 1;
    +        int offset = 0;
    +
    +        /* we don't expect characters more than 5 bytes */
    +        for (int i = 0; i < 5; i++) {
    +          final byte b = data[lastCharPosition];
    +          offset = getCharLength(b);
    +
    +          /* found beginning of a valid char */
    +          if (offset > 0) {
    +            final byte[] lastCharBytes = Arrays
    +                .copyOfRange(text, lastCharPosition, lastCharPosition + 
offset);
    +            /* last character */
    +            final String s = new String(lastCharBytes, 
Charset.forName("UTF-8"));
    +
    +            /* increment the codepoint of last character */
    +            int codePoint = s.codePointAt(s.length() - 1);
    +            codePoint++;
    +            final char[] incrementedChars = Character.toChars(codePoint);
    +
    +            /* convert char array to byte array */
    +            final CharBuffer charBuffer = 
CharBuffer.wrap(incrementedChars);
    +            final ByteBuffer byteBuffer = 
Charset.forName("UTF-8").encode(charBuffer);
    +            final byte[] bytes = Arrays.copyOfRange(byteBuffer.array(), 
byteBuffer.position(),
    +                byteBuffer.limit());
    +
    +            final byte[] result = new byte[lastCharPosition + 
bytes.length];
    +
    +            /* copy truncated array minus last char */
    +            System.arraycopy(text, 0, result, 0, lastCharPosition);
    +            /* copy last char */
    +            System.arraycopy(bytes, 0, result, lastCharPosition, 
bytes.length);
    +
    +            return new Text(result);
    +
    +          } /* not found keep looking for a beginning byte */ else {
    +            --lastCharPosition;
    +          }
    +
    +        }
    +        /* beginning of a valid char not found */
    +        throw new IllegalArgumentException(
    +            "Could not truncate string, beginning of a valid char not 
found");
    +      } else {
    +        return new Text(text);
    +      }
    +    }
    +
    +    private static Text truncateLowerBound(final Text text) {
    +      if(text.getBytes().length > MAX_BYTES_RECORDED) {
    +        return truncateLowerBound(text.getBytes());
    +      } else {
    +        return text;
    +      }
    +    }
    +
    +
    +    private static Text truncateLowerBound(final byte[] text) {
    +
    +      if(text.length > MAX_BYTES_RECORDED) {
    +
    +        int truncateLen = MAX_BYTES_RECORDED;
    +        int offset = 0;
    +
    +        for(int i=0; i<5; i++) {
    +
    +          byte b = text[truncateLen];
    +          /* check for the beginning of 1,2,3,4,5 bytes long char */
    +          offset = getCharLength(b);
    +
    +          /* found beginning of a valid char */
    +          if(offset > 0) {
    +            byte[] truncated = Arrays.copyOfRange(text, 0, (truncateLen));
    --- End diff --
    
    If you do:
    Text result = new Text();
    result.set(text, 0, truncateLen);
    return result;
    
    you'll have one fewer copy of the bytes.

---

[GitHub] orc pull request #299: ORC-203 - Update StringStatistics to trim long string...

Reply via email to