[ 
https://issues.apache.org/jira/browse/ORC-415?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16652755#comment-16652755
 ] 

ASF GitHub Bot commented on ORC-415:
------------------------------------

majetideepak closed pull request #319: ORC-415: [C++] Fix writing 
ColumnStatistics
URL: https://github.com/apache/orc/pull/319
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index eb2fc40de7..40f52a050b 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -280,10 +280,10 @@ namespace orc {
     }
 
     // update stats
-    bool hasNull = false;
     if (!structBatch->hasNulls) {
       colIndexStatistics->increase(numValues);
     } else {
+      bool hasNull = false;
       const char* notNull = structBatch->notNull.data() + offset;
       for (uint64_t i = 0; i < numValues; ++i) {
         if (notNull[i]) {
@@ -292,8 +292,10 @@ namespace orc {
           hasNull = true;
         }
       }
+      if (hasNull) {
+        colIndexStatistics->setHasNull(true);
+      }
     }
-    colIndexStatistics->setHasNull(hasNull);
   }
 
   void StructColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -461,7 +463,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    intStats->setHasNull(hasNull);
+    if (hasNull) {
+      intStats->setHasNull(true);
+    }
   }
 
   void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -564,7 +568,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    intStats->setHasNull(hasNull);
+    if (hasNull) {
+      intStats->setHasNull(true);
+    }
   }
 
   void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -666,7 +672,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    boolStats->setHasNull(hasNull);
+    if (hasNull) {
+      boolStats->setHasNull(true);
+    }
   }
 
   void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -774,7 +782,6 @@ namespace orc {
     size_t bytes = isFloat ? 4 : 8;
     char* data = buffer.data();
     bool hasNull = false;
-
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         if (isFloat) {
@@ -790,7 +797,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    doubleStats->setHasNull(hasNull);
+    if (hasNull) {
+      doubleStats->setHasNull(true);
+    }
   }
 
   void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -900,7 +909,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    strStats->setHasNull(hasNull);
+    if (hasNull) {
+      strStats->setHasNull(true);
+    }
   }
 
   void StringColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -980,8 +991,8 @@ namespace orc {
     if (strStats == nullptr) {
       throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
     }
-    bool hasNull = false;
 
+    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         char *charData = data[i];
@@ -1001,7 +1012,9 @@ namespace orc {
       }
     }
     lengthEncoder->add(length, numValues, notNull);
-    strStats->setHasNull(hasNull);
+    if (hasNull) {
+      strStats->setHasNull(true);
+    }
   }
 
   class VarCharColumnWriter : public StringColumnWriter {
@@ -1041,8 +1054,8 @@ namespace orc {
     if (strStats == nullptr) {
       throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
     }
-    bool hasNull = false;
 
+    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         if (length[i] > static_cast<int64_t>(maxLength)) {
@@ -1057,7 +1070,9 @@ namespace orc {
       }
     }
     lengthEncoder->add(length, numValues, notNull);
-    strStats->setHasNull(hasNull);
+    if (hasNull) {
+      strStats->setHasNull(true);
+    }
   }
 
   class BinaryColumnWriter : public StringColumnWriter {
@@ -1107,7 +1122,9 @@ namespace orc {
       }
     }
     lengthEncoder->add(length, numValues, notNull);
-    binStats->setHasNull(hasNull);
+    if (hasNull) {
+      binStats->setHasNull(true);
+    }
   }
 
   class TimestampColumnWriter : public ColumnWriter {
@@ -1222,7 +1239,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    tsStats->setHasNull(hasNull);
+    if (hasNull) {
+      tsStats->setHasNull(true);
+    }
 
     secRleEncoder->add(secs, numValues, notNull);
     nanoRleEncoder->add(nanos, numValues, notNull);
@@ -1314,7 +1333,9 @@ namespace orc {
         hasNull = true;
       }
     }
-    dateStats->setHasNull(hasNull);
+    if (hasNull) {
+      dateStats->setHasNull(true);
+    }
   }
 
   class Decimal64ColumnWriter : public ColumnWriter {
@@ -1391,8 +1412,8 @@ namespace orc {
     if (decStats == nullptr) {
       throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
     }
-    bool hasNull = false;
 
+    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         int64_t val = zigZag(values[i]);
@@ -1415,10 +1436,11 @@ namespace orc {
         hasNull = true;
       }
     }
+    if (hasNull) {
+      decStats->setHasNull(true);
+    }
     std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
     scaleEncoder->add(scales.data(), numValues, notNull);
-
-    decStats->setHasNull(hasNull);
   }
 
   void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) {
@@ -1511,10 +1533,10 @@ namespace orc {
     if (decStats == nullptr) {
       throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
     }
-    bool hasNull = false;
 
     // The current encoding of decimal columns stores the integer 
representation
     // of the value as an unbounded length zigzag encoded base 128 varint.
+    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         Int128 val = zigZagInt128(values[i]);
@@ -1536,10 +1558,11 @@ namespace orc {
         hasNull = true;
       }
     }
+    if (hasNull) {
+      decStats->setHasNull(true);
+    }
     std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
     scaleEncoder->add(scales.data(), numValues, notNull);
-
-    decStats->setHasNull(hasNull);
   }
 
   class ListColumnWriter : public ColumnWriter {
@@ -1639,10 +1662,10 @@ namespace orc {
     lengthEncoder->add(offsets, numValues, notNull);
 
     if (enableIndex) {
-      bool hasNull = false;
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             colIndexStatistics->increase(1);
@@ -1650,8 +1673,10 @@ namespace orc {
             hasNull = true;
           }
         }
+        if (hasNull) {
+          colIndexStatistics->setHasNull(true);
+        }
       }
-      colIndexStatistics->setHasNull(hasNull);
     }
   }
 
@@ -1843,10 +1868,10 @@ namespace orc {
     }
 
     if (enableIndex) {
-      bool hasNull = false;
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             colIndexStatistics->increase(1);
@@ -1854,8 +1879,10 @@ namespace orc {
             hasNull = true;
           }
         }
+        if (hasNull) {
+          colIndexStatistics->setHasNull(true);
+        }
       }
-      colIndexStatistics->setHasNull(hasNull);
     }
   }
 
@@ -2073,10 +2100,10 @@ namespace orc {
 
     // update stats
     if (enableIndex) {
-      bool hasNull = false;
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             colIndexStatistics->increase(1);
@@ -2084,8 +2111,10 @@ namespace orc {
             hasNull = true;
           }
         }
+        if (hasNull) {
+          colIndexStatistics->setHasNull(true);
+        }
       }
-      colIndexStatistics->setHasNull(hasNull);
     }
   }
 
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index 8122758473..d8cc7652ac 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -418,6 +418,8 @@ namespace orc {
       proto::BucketStatistics* bucketStats = 
pbStats.mutable_bucketstatistics();
       if (_hasCount) {
         bucketStats->add_count(_trueCount);
+      } else {
+        bucketStats->clear_count();
       }
     }
 
@@ -519,11 +521,14 @@ namespace orc {
       pbStats.set_hasnull(_stats.hasNull());
       pbStats.set_numberofvalues(_stats.getNumberOfValues());
 
+      proto::DateStatistics* dateStatistics =
+        pbStats.mutable_datestatistics();
       if (_stats.hasMinimum()) {
-        proto::DateStatistics* dateStatistics =
-          pbStats.mutable_datestatistics();
         dateStatistics->set_maximum(_stats.getMaximum());
         dateStatistics->set_minimum(_stats.getMinimum());
+      } else {
+        dateStatistics->clear_minimum();
+        dateStatistics->clear_maximum();
       }
     }
 
@@ -662,9 +667,14 @@ namespace orc {
       if (_stats.hasMinimum()) {
         decStats->set_minimum(_stats.getMinimum().toString());
         decStats->set_maximum(_stats.getMaximum().toString());
+      } else {
+        decStats->clear_minimum();
+        decStats->clear_maximum();
       }
       if (_stats.hasSum()) {
         decStats->set_sum(_stats.getSum().toString());
+      } else {
+        decStats->clear_sum();
       }
     }
 
@@ -836,9 +846,14 @@ namespace orc {
       if (_stats.hasMinimum()) {
         doubleStats->set_minimum(_stats.getMinimum());
         doubleStats->set_maximum(_stats.getMaximum());
+      } else {
+        doubleStats->clear_minimum();
+        doubleStats->clear_maximum();
       }
       if (_stats.hasSum()) {
         doubleStats->set_sum(_stats.getSum());
+      } else {
+        doubleStats->clear_sum();
       }
     }
 
@@ -980,9 +995,14 @@ namespace orc {
       if (_stats.hasMinimum()) {
         intStats->set_minimum(_stats.getMinimum());
         intStats->set_maximum(_stats.getMaximum());
+      } else {
+        intStats->clear_minimum();
+        intStats->clear_maximum();
       }
       if (_stats.hasSum()) {
         intStats->set_sum(_stats.getSum());
+      } else {
+        intStats->clear_sum();
       }
     }
 
@@ -1148,9 +1168,14 @@ namespace orc {
       if (_stats.hasMinimum()) {
         strStats->set_minimum(_stats.getMinimum());
         strStats->set_maximum(_stats.getMaximum());
+      } else {
+        strStats->clear_minimum();
+        strStats->clear_maximum();
       }
       if (_stats.hasTotalLength()) {
         strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
+      } else {
+        strStats->clear_sum();
       }
     }
 
@@ -1267,11 +1292,14 @@ namespace orc {
       pbStats.set_hasnull(_stats.hasNull());
       pbStats.set_numberofvalues(_stats.getNumberOfValues());
 
+      proto::TimestampStatistics* tsStats =
+        pbStats.mutable_timestampstatistics();
       if (_stats.hasMinimum()) {
-        proto::TimestampStatistics* tsStats =
-          pbStats.mutable_timestampstatistics();
         tsStats->set_minimumutc(_stats.getMinimum());
         tsStats->set_maximumutc(_stats.getMaximum());
+      } else {
+        tsStats->clear_minimumutc();
+        tsStats->clear_maximumutc();
       }
     }
 
diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index fee3318db7..621876927b 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -40,7 +40,7 @@ namespace orc {
     bool enableIndex;
 
     WriterOptionsPrivate() :
-                            fileVersion(FileVersion::v_0_12()) { // default to 
Hive_0_11
+                            fileVersion(FileVersion::v_0_12()) { // default to 
Hive_0_12
       stripeSize = 64 * 1024 * 1024; // 64M
       compressionBlockSize = 64 * 1024; // 64K
       rowIndexStride = 10000;


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [C++] Fix writing ColumnStatistics
> ----------------------------------
>
>                 Key: ORC-415
>                 URL: https://issues.apache.org/jira/browse/ORC-415
>             Project: ORC
>          Issue Type: Bug
>          Components: C++
>            Reporter: Gang Wu
>            Assignee: Gang Wu
>            Priority: Major
>
> Current C++ ORC writer implementation has two issues about column statistics.
> 1. A new batch may override previous batch's has_null info of 
> colIndexStatistics if the new batch has no null but the previous batch has at 
> least one null values.
> {code:java}
> bool hasNull = false;
> if (!structBatch->hasNulls) {
>   colIndexStatistics->increase(numValues);
> } else {
>   const char* notNull = structBatch->notNull.data() + offset;
>   for (uint64_t i = 0; i < numValues; ++i) {
>     if (notNull[i]) {
>       colIndexStatistics->increase(1);
>     } else if (!hasNull) {
>       hasNull = true;
>     }
>   }
> }
> colIndexStatistics->setHasNull(hasNull);{code}
> 2. If ColumnStatistics does not have any not-null data, it has no sum/min/max 
> infos and this results in writing generic but not type-specific 
> ColumnStatistics in the protobuf serialization. The problem is that reader 
> will have a hard time to deserialize the ColumnStatistics correctly.
> {code:java}
> void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
>   pbStats.set_hasnull(_stats.hasNull());
>   pbStats.set_numberofvalues(_stats.getNumberOfValues());
>   if (_stats.hasMinimum()) {
>     proto::DateStatistics* dateStatistics = pbStats.mutable_datestatistics();
>     dateStatistics->set_maximum(_stats.getMaximum());
>     dateStatistics->set_minimum(_stats.getMinimum());
>   }
> }
> {code}
>  
> The scope of this Jira is to fix these two problems.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to