orc git commit: ORC-203: Trim StringStatistics to a maximum size of 1024 bytes.

omalley Thu, 13 Sep 2018 06:28:15 -0700

Repository: orc
Updated Branches:
  refs/heads/master f47e02cfb -> 5ce07a149



ORC-203: Trim StringStatistics to a maximum size of 1024 bytes.

Fixes #299

Signed-off-by: Owen O'Malley <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/5ce07a14
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/5ce07a14
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/5ce07a14

Branch: refs/heads/master
Commit: 5ce07a14947bac30c8b0abd3a78dd7f7412c792c
Parents: f47e02c
Author: Sandeep More <[email protected]>
Authored: Wed Jul 18 09:31:02 2018 -0400
Committer: Owen O'Malley <[email protected]>
Committed: Thu Sep 13 09:27:25 2018 -0400

----------------------------------------------------------------------
 java/core/src/java/org/apache/orc/OrcFile.java  |   3 +-
 .../org/apache/orc/StringColumnStatistics.java  |  16 ++
 .../apache/orc/impl/ColumnStatisticsImpl.java   | 260 ++++++++++++++++--
 .../org/apache/orc/TestColumnStatistics.java    | 270 +++++++++++++++++--
 .../resources/orc-file-dump-bloomfilter.out     |   2 +-
 .../resources/orc-file-dump-bloomfilter2.out    |   2 +-
 .../orc-file-dump-dictionary-threshold.out      |   2 +-
 .../tools/src/test/resources/orc-file-dump.json |   2 +-
 java/tools/src/test/resources/orc-file-dump.out |   2 +-
 .../src/test/resources/orc-file-has-null.out    |   2 +-
 proto/orc_proto.proto                           |   4 +
 11 files changed, 513 insertions(+), 52 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/core/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java 
b/java/core/src/java/org/apache/orc/OrcFile.java
index 33aa431..5a25725 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -171,6 +171,7 @@ public class OrcFile {
     HIVE_13083(WriterImplementation.ORC_JAVA, 4), // decimals write present 
stream correctly
     ORC_101(WriterImplementation.ORC_JAVA, 5),   // bloom filters use utf8
     ORC_135(WriterImplementation.ORC_JAVA, 6),   // timestamp stats use utc
+    ORC_203(WriterImplementation.ORC_JAVA, 7),   // trim long strings & record 
they were trimmed
 
     // C++ ORC Writer
     ORC_CPP_ORIGINAL(WriterImplementation.ORC_CPP, 6),
@@ -254,7 +255,7 @@ public class OrcFile {
   /**
    * The WriterVersion for this version of the software.
    */
-  public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_135;
+  public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_203;
 
   public enum EncodingStrategy {
     SPEED, COMPRESSION

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/core/src/java/org/apache/orc/StringColumnStatistics.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/StringColumnStatistics.java 
b/java/core/src/java/org/apache/orc/StringColumnStatistics.java
index 936b100..9ecbdc9 100644
--- a/java/core/src/java/org/apache/orc/StringColumnStatistics.java
+++ b/java/core/src/java/org/apache/orc/StringColumnStatistics.java
@@ -34,6 +34,22 @@ public interface StringColumnStatistics extends 
ColumnStatistics {
   String getMaximum();
 
   /**
+   * Get the lower bound of the values in this column.
+   * The value may be truncated to at most
+   * MAX_BYTES_RECORDED.
+   * @return lower bound
+   */
+  String getLowerBound();
+
+  /**
+   * Get the upper bound of the values in this column.
+   * The value may be truncated to at most
+   * MAX_BYTES_RECORDED.
+   * @return upper bound
+   */
+  String getUpperBound();
+
+  /**
    * Get the total length of all strings
    * @return the sum (total length)
    */

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java 
b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index be05d80..1b8c801 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -17,13 +17,9 @@
  */
 package org.apache.orc.impl;
 
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.TimeZone;
-
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparator;
@@ -39,6 +35,15 @@ import org.apache.orc.StringColumnStatistics;
 import org.apache.orc.TimestampColumnStatistics;
 import org.apache.orc.TypeDescription;
 
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.TimeZone;
+
 public class ColumnStatisticsImpl implements ColumnStatistics {
 
   @Override
@@ -517,10 +522,14 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
 
   protected static final class StringStatisticsImpl extends 
ColumnStatisticsImpl
       implements StringColumnStatistics {
+    public static final int MAX_BYTES_RECORDED = 1024;
     private Text minimum = null;
     private Text maximum = null;
     private long sum = 0;
 
+    private boolean isLowerBoundSet = false;
+    private boolean isUpperBoundSet = false;
+
     StringStatisticsImpl() {
     }
 
@@ -543,35 +552,51 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
       super.reset();
       minimum = null;
       maximum = null;
+      isLowerBoundSet = false;
+      isUpperBoundSet = false;
       sum = 0;
     }
 
     @Override
     public void updateString(Text value) {
-      if (minimum == null) {
-        maximum = minimum = new Text(value);
-      } else if (minimum.compareTo(value) > 0) {
-        minimum = new Text(value);
-      } else if (maximum.compareTo(value) < 0) {
-        maximum = new Text(value);
-      }
-      sum += value.getLength();
+      updateString(value.getBytes(), 0, value.getLength(), 1);
     }
 
     @Override
     public void updateString(byte[] bytes, int offset, int length,
                              int repetitions) {
       if (minimum == null) {
-        maximum = minimum = new Text();
-        maximum.set(bytes, offset, length);
+        if(length > MAX_BYTES_RECORDED) {
+          minimum = truncateLowerBound(bytes, offset);
+          maximum = truncateUpperBound(bytes, offset);
+          isLowerBoundSet = true;
+          isUpperBoundSet = true;
+        } else {
+          maximum = minimum = new Text();
+          maximum.set(bytes, offset, length);
+          isLowerBoundSet = false;
+          isUpperBoundSet = false;
+        }
       } else if (WritableComparator.compareBytes(minimum.getBytes(), 0,
           minimum.getLength(), bytes, offset, length) > 0) {
-        minimum = new Text();
-        minimum.set(bytes, offset, length);
+        if(length > MAX_BYTES_RECORDED) {
+          minimum = truncateLowerBound(bytes, offset);
+          isLowerBoundSet = true;
+        } else {
+          minimum = new Text();
+          minimum.set(bytes, offset, length);
+          isLowerBoundSet = false;
+        }
       } else if (WritableComparator.compareBytes(maximum.getBytes(), 0,
           maximum.getLength(), bytes, offset, length) < 0) {
-        maximum = new Text();
-        maximum.set(bytes, offset, length);
+        if(length > MAX_BYTES_RECORDED) {
+          maximum = truncateUpperBound(bytes, offset);
+          isUpperBoundSet = true;
+        } else {
+          maximum = new Text();
+          maximum.set(bytes, offset, length);
+          isUpperBoundSet = false;
+        }
       }
       sum += (long)length * repetitions;
     }
@@ -584,16 +609,40 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
           if (str.minimum != null) {
             maximum = new Text(str.getMaximum());
             minimum = new Text(str.getMinimum());
-          } else {
+          }
+          /* str.minimum == null when lower bound set */
+          else if (str.isLowerBoundSet) {
+            minimum = new Text(str.getLowerBound());
+            isLowerBoundSet = str.isLowerBoundSet;
+
+            /* check for upper bound before setting max */
+            if (str.isUpperBoundSet) {
+              maximum = new Text(str.getUpperBound());
+              isUpperBoundSet = str.isUpperBoundSet;
+            } else {
+              maximum = new Text(str.getMaximum());
+            }
+          }
+          else {
           /* both are empty */
             maximum = minimum = null;
           }
         } else if (str.minimum != null) {
           if (minimum.compareTo(str.minimum) > 0) {
-            minimum = new Text(str.getMinimum());
+            if(str.isLowerBoundSet) {
+              minimum = new Text(str.getLowerBound());
+              isLowerBoundSet = str.isLowerBoundSet;
+            } else {
+              minimum = new Text(str.getMinimum());
+            }
           }
           if (maximum.compareTo(str.maximum) < 0) {
-            maximum = new Text(str.getMaximum());
+            if(str.isUpperBoundSet) {
+              maximum = new Text(str.getUpperBound());
+              isUpperBoundSet = str.isUpperBoundSet;
+            }else {
+              maximum = new Text(str.getMaximum());
+            }
           }
         }
         sum += str.sum;
@@ -621,11 +670,45 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
 
     @Override
     public String getMinimum() {
-      return minimum == null ? null : minimum.toString();
+      /* if we have lower bound set (in case of truncation)
+      getMinimum will be null */
+      if(isLowerBoundSet) {
+        return null;
+      } else {
+        return minimum == null ? null : minimum.toString();
+      }
     }
 
     @Override
     public String getMaximum() {
+      /* if the upper bound is set (in case of truncation)
+      getMaximum will be null */
+      if(isUpperBoundSet) {
+        return null;
+      } else {
+        return maximum == null ? null : maximum.toString();
+      }
+    }
+
+    /**
+     * Get the minimum string, truncated at a character boundary to at most
+     * StringStatisticsImpl.MAX_BYTES_RECORDED bytes if it was longer.
+     *
+     * @return lower bound
+     */
+    @Override
+    public String getLowerBound() {
+      return minimum == null ? null : minimum.toString();
+    }
+
+    /**
+     * Get the maximum string, or, if it was longer than
+     * StringStatisticsImpl.MAX_BYTES_RECORDED bytes, a truncated upper bound.
+     *
+     * @return upper bound
+     */
+    @Override
+    public String getUpperBound() {
       return maximum == null ? null : maximum.toString();
     }
 
@@ -637,11 +720,19 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
     @Override
     public String toString() {
       StringBuilder buf = new StringBuilder(super.toString());
-      if (getNumberOfValues() != 0) {
-        buf.append(" min: ");
-        buf.append(getMinimum());
-        buf.append(" max: ");
-        buf.append(getMaximum());
+      if (minimum != null) {
+        if (isLowerBoundSet) {
+          buf.append(" lower: ");
+        } else {
+          buf.append(" min: ");
+        }
+        buf.append(getLowerBound());
+        if (isUpperBoundSet) {
+          buf.append(" upper: ");
+        } else {
+          buf.append(" max: ");
+        }
+        buf.append(getUpperBound());
         buf.append(" sum: ");
         buf.append(sum);
       }
@@ -683,6 +774,119 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
       result = 31 * result + (int) (sum ^ (sum >>> 32));
       return result;
     }
+
+    /**
+     * Find the start of the last character that ends in the current string.
+     * @param text the bytes of the utf-8
+     * @param from the first byte location
+     * @param until the last byte location
+     * @return the index of the last character
+     */
+    private static int findLastCharacter(byte[] text, int from, int until) {
+      int posn = until;
+      /* we don't expect characters more than 5 bytes */
+      while (posn >= from) {
+        if (getCharLength(text[posn]) > 0) {
+          return posn;
+        }
+        posn -= 1;
+      }
+      /* beginning of a valid char not found */
+      throw new IllegalArgumentException(
+          "Could not truncate string, beginning of a valid char not found");
+    }
+
+    private static int getCodePoint(byte[] source, int from, int len) {
+      return new String(source, from, len, StandardCharsets.UTF_8)
+          .codePointAt(0);
+    }
+
+    private static void appendCodePoint(Text result, int codepoint) {
+      if (codepoint < 0 || codepoint > 0x1f_ffff) {
+        throw new IllegalArgumentException("Codepoint out of range " +
+            codepoint);
+      }
+      byte[] buffer = new byte[4];
+      if (codepoint < 0x7f) {
+        buffer[0] = (byte) codepoint;
+        result.append(buffer, 0, 1);
+      } else if (codepoint <= 0x7ff) {
+        buffer[0] = (byte) (0xc0 | (codepoint >> 6));
+        buffer[1] = (byte) (0x80 | (codepoint & 0x3f));
+        result.append(buffer, 0 , 2);
+      } else if (codepoint < 0xffff) {
+        buffer[0] = (byte) (0xe0 | (codepoint >> 12));
+        buffer[1] = (byte) (0x80 | ((codepoint >> 6) & 0x3f));
+        buffer[2] = (byte) (0x80 | (codepoint & 0x3f));
+        result.append(buffer, 0, 3);
+      } else {
+        buffer[0] = (byte) (0xf0 | (codepoint >> 18));
+        buffer[1] = (byte) (0x80 | ((codepoint >> 12) & 0x3f));
+        buffer[2] = (byte) (0x80 | ((codepoint >> 6) & 0x3f));
+        buffer[3] = (byte) (0x80 | (codepoint & 0x3f));
+        result.append(buffer, 0, 4);
+      }
+    }
+
+    /**
+     * Create a text that is truncated to at most MAX_BYTES_RECORDED at a
+     * character boundary with the last code point incremented by 1.
+     * The length is assumed to be greater than MAX_BYTES_RECORDED.
+     * @param text the text to truncate
+     * @param from the index of the first character
+     * @return truncated Text value
+     */
+    private static Text truncateUpperBound(final byte[] text, final int from) {
+      int followingChar = findLastCharacter(text, from,
+          from + MAX_BYTES_RECORDED);
+      int lastChar = findLastCharacter(text, from, followingChar - 1);
+      Text result = new Text();
+      result.set(text, from, lastChar - from);
+      appendCodePoint(result,
+          getCodePoint(text, lastChar, followingChar - lastChar) + 1);
+      return result;
+    }
+
+    /**
+     * Create a text that is truncated to at most MAX_BYTES_RECORDED at a
+     * character boundary.
+     * The length is assumed to be greater than MAX_BYTES_RECORDED.
+     * @param text Byte array to truncate
+     * @param from This is the index of the first character
+     * @return truncated {@link Text}
+     */
+    private static Text truncateLowerBound(final byte[] text, final int from) {
+
+      int lastChar = findLastCharacter(text, from, from + MAX_BYTES_RECORDED);
+      Text result = new Text();
+      result.set(text, from, lastChar - from);
+      return result;
+    }
+
+    /**
+     * A helper function that returns the length of the UTF-8 character
+     * IF the given byte is beginning of a valid char.
+     * In case it is a beginning byte, a value greater than 0
+     * is returned (length of character in bytes).
+     * Else 0 is returned
+     * @param b the byte to inspect
+     * @return 0 if not beginning of char else length of char in bytes
+     */
+    private static int getCharLength(byte b) {
+      int len = 0;
+      if((b & 0b10000000) == 0b00000000 ) {
+        len = 1;
+      } else if ((b & 0b11100000) == 0b11000000 ) {
+        len = 2;
+      } else if ((b & 0b11110000) == 0b11100000 ) {
+        len = 3;
+      } else if ((b & 0b11111000) == 0b11110000 ) {
+        len = 4;
+      } else if ((b & 0b11111100) == 0b11111000 ) {
+        len = 5;
+      }
+      return len;
+    }
   }
 
   protected static final class BinaryStatisticsImpl extends 
ColumnStatisticsImpl implements

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/core/src/test/org/apache/orc/TestColumnStatistics.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java 
b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
index 2045004..30e310c 100644
--- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java
+++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
@@ -18,27 +18,13 @@
 
 package org.apache.orc;
 
-import static junit.framework.Assert.assertEquals;
-import static org.junit.Assume.assumeTrue;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.PrintStream;
-import java.sql.Timestamp;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.List;
-import java.util.TimeZone;
-
+import org.apache.commons.lang.RandomStringUtils;
+import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.orc.impl.ColumnStatisticsImpl;
 import org.junit.Before;
@@ -46,6 +32,17 @@ import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TestName;
 
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.sql.Timestamp;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Arrays;
+import java.util.TimeZone;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
+
 /**
  * Test ColumnStatisticsImpl for ORC.
  */
@@ -122,6 +119,245 @@ public class TestColumnStatistics {
   }
 
   @Test
+  public void testUpperAndLowerBounds() throws Exception {
+    final TypeDescription schema = TypeDescription.createString();
+
+    final String test = RandomStringUtils.random(1024+10);
+    final String fragment = "foo"+test;
+    final String fragmentLowerBound = "bar"+test;
+
+
+    final ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+    final ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+
+    /* test a scenario for the first max string */
+    stats1.updateString(new Text(test));
+
+    final StringColumnStatistics typed = (StringColumnStatistics) stats1;
+    final StringColumnStatistics typed2 = (StringColumnStatistics) stats2;
+
+    assertTrue("Upperbound cannot be more than 1024 bytes",1024 >= 
typed.getUpperBound().getBytes().length);
+    assertTrue("Lowerbound cannot be more than 1024 bytes",1024 >= 
typed.getLowerBound().getBytes().length);
+
+    assertEquals(null, typed.getMinimum());
+    assertEquals(null, typed.getMaximum());
+
+    stats1.reset();
+
+    /* test a scenario for the first max bytes */
+    stats1.updateString(test.getBytes(), 0, test.getBytes().length, 0);
+
+    assertTrue("Lowerbound cannot be more than 1024 bytes", 1024 >= 
typed.getLowerBound().getBytes().length);
+    assertTrue("Upperbound cannot be more than 1024 bytes", 1024 >= 
typed.getUpperBound().getBytes().length);
+
+    assertEquals(null, typed.getMinimum());
+    assertEquals(null, typed.getMaximum());
+
+    stats1.reset();
+    /* test upper bound - merging  */
+    stats1.updateString(new Text("bob"));
+    stats1.updateString(new Text("david"));
+    stats1.updateString(new Text("charles"));
+
+    stats2.updateString(new Text("anne"));
+    stats2.updateString(new Text(fragment));
+
+    assertEquals("anne", typed2.getMinimum());
+    assertEquals(null, typed2.getMaximum());
+
+    stats1.merge(stats2);
+
+    assertEquals("anne", typed.getMinimum());
+    assertEquals(null, typed.getMaximum());
+
+
+    /* test lower bound - merging  */
+    stats1.reset();
+    stats2.reset();
+
+    stats1.updateString(new Text("david"));
+    stats1.updateString(new Text("charles"));
+
+    stats2.updateString(new Text("jane"));
+    stats2.updateString(new Text(fragmentLowerBound));
+
+    stats1.merge(stats2);
+
+    assertEquals(null, typed.getMinimum());
+    assertEquals("jane", typed.getMaximum());
+  }
+
+  /**
+   * Test the string truncation with 1 byte characters. The last character
+   * of the truncated string is 0x7f so that it will expand into a 2 byte
+   * utf-8 character.
+   */
+  @Test
+  public void testBoundsAscii() {
+    StringBuilder buffer = new StringBuilder();
+    for(int i=0; i < 256; ++i) {
+      buffer.append("Owe\u007fn");
+    }
+    ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(
+            TypeDescription.createString());
+    stats.increment();
+    stats.updateString(new Text(buffer.toString()));
+    StringColumnStatistics stringStats = (StringColumnStatistics) stats;
+
+    // make sure that the min/max are null
+    assertEquals(null, stringStats.getMinimum());
+    assertEquals(null, stringStats.getMaximum());
+    assertEquals(5 * 256, stringStats.getSum());
+
+    // and that the lower and upper bound are correct
+    assertEquals(buffer.substring(0, 1024), stringStats.getLowerBound());
+    assertEquals("Owe\u0080", stringStats.getUpperBound().substring(1020));
+    assertEquals("count: 1 hasNull: false lower: " + 
stringStats.getLowerBound()
+            + " upper: " + stringStats.getUpperBound() + " sum: 1280",
+        stringStats.toString());
+
+    // make sure that when we replace the min & max the flags get cleared.
+    stats.increment();
+    stats.updateString(new Text("xxx"));
+    assertEquals("xxx", stringStats.getMaximum());
+    assertEquals("xxx", stringStats.getUpperBound());
+    stats.increment();
+    stats.updateString(new Text("A"));
+    assertEquals("A", stringStats.getMinimum());
+    assertEquals("A", stringStats.getLowerBound());
+    assertEquals("count: 3 hasNull: false min: A max: xxx sum: 1284",
+        stats.toString());
+  }
+
+  /**
+   * Test truncation with 2 byte utf-8 characters.
+   */
+  @Test
+  public void testBoundsTwoByte() {
+    StringBuilder buffer = new StringBuilder();
+    final String PATTERN = "\u0080\u07ff\u0432\u0246\u0123";
+    for(int i=0; i < 256; ++i) {
+      buffer.append(PATTERN);
+    }
+    ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(
+        TypeDescription.createString());
+    stats.increment();
+    stats.updateString(new Text(buffer.toString()));
+    StringColumnStatistics stringStats = (StringColumnStatistics) stats;
+
+    // make sure that the min/max are null
+    assertEquals(null, stringStats.getMinimum());
+    assertEquals(null, stringStats.getMaximum());
+    assertEquals(2 * 5 * 256, stringStats.getSum());
+
+    // and that the lower and upper bound are correct
+    // 512 two byte characters fit in 1024 bytes
+    assertEquals(buffer.substring(0, 512), stringStats.getLowerBound());
+    assertEquals(buffer.substring(0, 511),
+        stringStats.getUpperBound().substring(0, 511));
+    assertEquals("\u0800", stringStats.getUpperBound().substring(511));
+  }
+
+  /**
+   * Test truncation with 3 byte utf-8 characters.
+   */
+  @Test
+  public void testBoundsThreeByte() {
+    StringBuilder buffer = new StringBuilder();
+    final String PATTERN = "\uffff\u0800\u4321\u1234\u3137";
+    for(int i=0; i < 256; ++i) {
+      buffer.append(PATTERN);
+    }
+    ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(
+        TypeDescription.createString());
+    stats.increment();
+    stats.updateString(new Text(buffer.toString()));
+    StringColumnStatistics stringStats = (StringColumnStatistics) stats;
+
+    // make sure that the min/max are null
+    assertEquals(null, stringStats.getMinimum());
+    assertEquals(null, stringStats.getMaximum());
+    assertEquals(3 * 5 * 256, stringStats.getSum());
+
+    // and that the lower and upper bound are correct
+    // 341 three byte characters fit in 1024 bytes
+    assertEquals(buffer.substring(0, 341), stringStats.getLowerBound());
+    assertEquals(buffer.substring(0, 340),
+        stringStats.getUpperBound().substring(0,340));
+    assertEquals("\ud800\udc00", stringStats.getUpperBound().substring(340));
+  }
+
+  /**
+   * Test truncation with 4 byte utf-8 characters.
+   */
+  @Test
+  public void testBoundsFourByte() {
+    StringBuilder buffer = new StringBuilder();
+    final String PATTERN = 
"\ud800\udc00\ud801\udc01\ud802\udc02\ud803\udc03\ud804\udc04";
+    for(int i=0; i < 256; ++i) {
+      buffer.append(PATTERN);
+    }
+    ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(
+        TypeDescription.createString());
+    stats.increment();
+    stats.updateString(new Text(buffer.toString()));
+    StringColumnStatistics stringStats = (StringColumnStatistics) stats;
+
+    // make sure that the min/max are null
+    assertEquals(null, stringStats.getMinimum());
+    assertEquals(null, stringStats.getMaximum());
+    assertEquals(4 * 5 * 256, stringStats.getSum());
+
+    // and that the lower and upper bound are correct
+    // 256 four byte characters fit in 1024 bytes
+    assertEquals(buffer.substring(0, 512), stringStats.getLowerBound());
+    assertEquals(buffer.substring(0, 510),
+        stringStats.getUpperBound().substring(0, 510));
+    assertEquals("\\uD800\\uDC01",
+        
StringEscapeUtils.escapeJava(stringStats.getUpperBound().substring(510)));
+  }
+
+  @Test
+  public void testUpperBoundCodepointIncrement() {
+    /* test with characters that use more than one byte */
+    final String fragment =  
"è¼è¨å¿åç°æ¢è¾æçºä½µéå²©ãå¤ç¾æ±çæ²æ§æä¹æåæ¸ç´¢ã"
+        + 
"å¯æä»¶æç¨å°äº¤æç¸ä¿®å®®ç±æ¹ä¾¡è¦ãä½å£ä¾å¹¾æ¥æ¬æ±ç¥éæ©ææ±åå·åä¸ç¯å¤ç¬¬åã"
+        + "ç®¡ä»æå³ç³è·å¸¸æ®æµ·å¶æè¦§æè³æãé£å 
å¤éµå¹´å¤ªé¡ä¼å¨é¢å¸å®³æç£ã"
+        + "ååè¼å½åçé æ«è¦å¾©æ¥è»å¿æãå
åçé¢æªæ³ä¼ä¼å£ä¸çå¹ç¹å¸³å¹çºé½è©±éã"
+        + "è¦ªç¦ææ 
åéæ³¨èªæå³¶æç´éåæ´¾ä¼éãå¹çµé¿åéé½ç´¹ç¥ç¦è¿½åæ¥ã"
+        + "æ ¹æ¡åè©±å°æ 
¼æ²»ä½ç¸æ©éå¸å¤éä½ãè©±ç¬¬åå¹³å½éè² äº¬è¤æ²æ¸å¤çã"
+        + "åå¹´ç¾¤è¾ºè»½å¦»æ¢åçæ¨©æçè¦è³ªå¨ç ´å¿ã"
+        + "à¤¨à¥à¤à¥ à¤®à¥à¤à¥à¤¤ à¤¬à¤¿à¤¨à¥à¤¦à¥à¤ 
à¤¸à¤®à¤¸à¥à¤¯à¤¾à¤ à¤à¤à¤¤à¤°à¤à¤¾à¤°à¥à¤¯à¤à¥à¤·à¤®à¤¤à¤¾ 
à¤¸à¥à¤¨à¤¾ à¤ªà¥à¤°à¤¤à¤¿ à¤¸à¤à¥à¤à¥à¤ à¤¯à¤¾à¤¯à¥à¤à¤¾ 
à¤¦à¤¿à¤¨à¤¾à¤à¤ à¤µà¤¾à¤¤à¤¾à¤µà¤°à¤£ ";
+
+    final String input = fragment
+            + "à¤®à¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ "
+            + "à¤²à¤à¤¤à¥ à¤¨à¤µà¤à¤¬à¤° à¤ªà¥à¤°à¤®à¤¾à¤¨ 
à¤à¤¯à¥à¤à¤¯à¤¾ à¤¸à¤®à¤¸à¥à¤¯à¤¾à¤ à¤µà¤¿à¤¶à¥à¤µ à¤²à¤¿à¤¯à¥ 
à¤¸à¤®à¤à¤¤à¥ à¤à¤ªà¤à¥ à¤à¤à¤¤à¥à¤°à¤¿à¤¤ 
à¤µà¤¿à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¤ à¤¸à¥à¤µà¤¤à¤à¤¤à¥à¤° "
+            + "à¤µà¥à¤¯à¤¾à¤à¥à¤¯à¤¾à¤¨ à¤à¥à¤¦à¤¨à¤à¥à¤·à¤®à¤¤à¤¾ 
à¤¶à¥à¤à¥à¤° à¤¹à¥à¤à¤° à¤®à¥à¤à¤¯ à¤à¤°à¤¤à¤¾à¥¤ à¤¦à¤°à¥à¤¶à¤¾à¤¤à¤¾ 
à¤µà¤¾à¤¤à¤¾à¤µà¤°à¤£ à¤µà¤¿à¤¸à¥à¤¤à¤°à¤£à¤à¥à¤·à¤®à¤¤à¤¾ 
à¤¦à¥à¤·à¤¸à¤à¥ à¤ªà¥à¤°à¤¾à¤ªà¥à¤¤ à¤¸à¤®à¤¾à¤à¥ "
+            + "à¥¤à¤ à¤¤à¤à¤¨à¥à¤à¥ à¤¦à¤°à¥à¤¶à¤¾à¤¤à¤¾ 
à¤à¤¾à¤°à¥à¤¯à¤à¤°à¥à¤¤à¤¾ à¤¬à¤¾à¤§à¤¾ à¤à¤·à¤§à¤¿à¤ 
à¤¸à¤®à¤¸à¥à¤¯à¤¾à¤ à¤¸à¤®à¤¸à¥à¤¯à¤¾à¤ à¤à¥à¤ªà¤¨à¥à¤¯à¤¤à¤¾ 
à¤ªà¥à¤°à¤¾à¤£ à¤ªà¤¸à¤à¤¦ "
+            + "à¤à¥à¤¯à¤¹ à¤¨à¤µà¤à¤¬à¤° à¤¦à¥à¤·à¤¸à¤à¥ à¤
à¤¨à¥à¤µà¤¾à¤¦à¤ à¤¸à¥à¤«à¤¼à¤¤à¤µà¥à¤° à¤¸à¤®à¤¸à¥à¤¯à¤¾à¤ 
à¤à¥à¤·à¤®à¤¤à¤¾à¥¤ à¤à¤¾à¤°à¥à¤¯ à¤¹à¥à¤à¤°\n";
+
+    final String lowerBound = fragment +
+        "à¤®à¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ à¤²à¤à¤¤à¥ 
à¤¨à¤µà¤à¤¬à¤° à¤ªà¥à¤°à¤®à¤¾à¤¨ à¤à¤¯à¥à¤à¤¯à¤¾ à¤¸à¤®à¤¸à¥à¤¯à¤¾à¤ 
à¤µà¤¿à¤¶à¥à¤µ à¤²à¤¿à¤¯à¥ ";
+
+    final String upperbound = fragment +
+        "à¤®à¥à¤¶à¥à¤à¤¿à¤²à¥ à¤à¥à¤¨à¥à¤¦à¥à¤°à¤¿à¤¯ à¤²à¤à¤¤à¥ 
à¤¨à¤µà¤à¤¬à¤° à¤ªà¥à¤°à¤®à¤¾à¤¨ à¤à¤¯à¥à¤à¤¯à¤¾ à¤¸à¤®à¤¸à¥à¤¯à¤¾à¤ 
à¤µà¤¿à¤¶à¥à¤µ à¤²à¤¿à¤¯à¥!";
+
+    final TypeDescription schema = TypeDescription.createString();
+    final ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+    byte[] utf8 = input.getBytes(StandardCharsets.UTF_8);
+    stats1.updateString(utf8, 0, utf8.length, 1);
+
+    final StringColumnStatistics typed = (StringColumnStatistics) stats1;
+
+    assertEquals(354, typed.getUpperBound().length());
+    assertEquals(354, typed.getLowerBound().length());
+
+    assertEquals(upperbound, typed.getUpperBound());
+    assertEquals(lowerBound, typed.getLowerBound());
+  }
+
+
+  @Test
   public void testDateMerge() throws Exception {
     TypeDescription schema = TypeDescription.createDate();
 

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out 
b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index 2a20a71..da79120 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with ORC_135
+File Version: 0.12 with ORC_203
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out 
b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index c4fa8bf..4ec83bf 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with ORC_135
+File Version: 0.12 with ORC_203
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
----------------------------------------------------------------------
diff --git 
a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out 
b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
index 9b9dbef..14e9ac3 100644
--- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with ORC_135
+File Version: 0.12 with ORC_203
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.json 
b/java/tools/src/test/resources/orc-file-dump.json
index 72476dd..91c1a2b 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -1,7 +1,7 @@
 {
   "fileName": "TestFileDump.testDump.orc",
   "fileVersion": "0.12",
-  "writerVersion": "ORC_135",
+  "writerVersion": "ORC_203",
   "numberOfRows": 21000,
   "compression": "ZLIB",
   "compressionBufferSize": 4096,

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-dump.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.out 
b/java/tools/src/test/resources/orc-file-dump.out
index 2ae99ce..d988155 100644
--- a/java/tools/src/test/resources/orc-file-dump.out
+++ b/java/tools/src/test/resources/orc-file-dump.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with ORC_135
+File Version: 0.12 with ORC_203
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/java/tools/src/test/resources/orc-file-has-null.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-has-null.out 
b/java/tools/src/test/resources/orc-file-has-null.out
index ed963dd..4fb7d69 100644
--- a/java/tools/src/test/resources/orc-file-has-null.out
+++ b/java/tools/src/test/resources/orc-file-has-null.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with ORC_135
+File Version: 0.12 with ORC_203
 Rows: 20000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/5ce07a14/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index f92e531..e54427d 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -39,6 +39,10 @@ message StringStatistics {
   optional string maximum = 2;
   // sum will store the total length of all strings in a stripe
   optional sint64 sum = 3;
+  // If the minimum or maximum value was longer than 1024 bytes, store a lower 
or upper
+  // bound instead of the minimum or maximum values above.
+  optional string lowerBound = 4;
+  optional string upperBound = 5;
 }
 
 message BucketStatistics {

orc git commit: ORC-203: Trim StringStatistics to a maximum size of 1024 bytes.

Reply via email to