This is an automated email from the ASF dual-hosted git repository.

mdeepak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new f67b22f  ORC-453:[C++] Improve Statistics->setHasNull(true) code.
f67b22f is described below

commit f67b22fd8c6f9ce25afb5dcef76c2467e4c6f19b
Author: rip-nsk <[email protected]>
AuthorDate: Tue Dec 25 17:23:33 2018 -0800

    ORC-453:[C++] Improve Statistics->setHasNull(true) code.
    
    Fixes #352
    
    Signed-off-by: Deepak Majeti <[email protected]>
---
 c++/src/ColumnWriter.cc | 80 ++++++++++---------------------------------------
 1 file changed, 16 insertions(+), 64 deletions(-)

diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index ef31c45..a89b9c5 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -290,17 +290,14 @@ namespace orc {
       colIndexStatistics->increase(numValues);
     } else {
       uint64_t count = 0;
-      bool hasNull = false;
       const char* notNull = structBatch->notNull.data() + offset;
       for (uint64_t i = 0; i < numValues; ++i) {
         if (notNull[i]) {
           ++count;
-        } else if (!hasNull) {
-          hasNull = true;
         }
       }
       colIndexStatistics->increase(count);
-      if (hasNull) {
+      if (count < numValues) {
         colIndexStatistics->setHasNull(true);
       }
     }
@@ -468,17 +465,14 @@ namespace orc {
 
     // update stats
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (notNull == nullptr || notNull[i]) {
         ++count;
         intStats->update(data[i], 1);
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     intStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       intStats->setHasNull(true);
     }
   }
@@ -575,17 +569,14 @@ namespace orc {
     byteRleEncoder->add(byteData, numValues, notNull);
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (notNull == nullptr || notNull[i]) {
         ++count;
         intStats->update(static_cast<int64_t>(byteData[i]), 1);
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     intStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       intStats->setHasNull(true);
     }
   }
@@ -682,17 +673,14 @@ namespace orc {
     rleEncoder->add(byteData, numValues, notNull);
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (notNull == nullptr || notNull[i]) {
         ++count;
         boolStats->update(byteData[i] != 0, 1);
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     boolStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       boolStats->setHasNull(true);
     }
   }
@@ -802,7 +790,6 @@ namespace orc {
     size_t bytes = isFloat ? 4 : 8;
     char* data = buffer.data();
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         if (isFloat) {
@@ -813,12 +800,10 @@ namespace orc {
         dataStream->write(data, bytes);
         ++count;
         doubleStats->update(doubleData[i]);
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     doubleStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       doubleStats->setHasNull(true);
     }
   }
@@ -1104,7 +1089,6 @@ namespace orc {
     }
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         const size_t len = static_cast<size_t>(length[i]);
@@ -1116,12 +1100,10 @@ namespace orc {
         }
         strStats->update(data[i], len);
         ++count;
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     strStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       strStats->setHasNull(true);
     }
   }
@@ -1465,7 +1447,6 @@ namespace orc {
                           charsBatch->notNull.data() + offset : nullptr;
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         const char * charData = nullptr;
@@ -1494,8 +1475,6 @@ namespace orc {
 
         strStats->update(charData, static_cast<size_t>(length[i]));
         ++count;
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
 
@@ -1504,7 +1483,7 @@ namespace orc {
     }
 
     strStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       strStats->setHasNull(true);
     }
   }
@@ -1549,7 +1528,6 @@ namespace orc {
                           charsBatch->notNull.data() + offset : nullptr;
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         uint64_t itemLength = Utf8Utils::truncateBytesTo(
@@ -1565,8 +1543,6 @@ namespace orc {
 
         strStats->update(data[i], static_cast<size_t>(length[i]));
         ++count;
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
 
@@ -1575,7 +1551,7 @@ namespace orc {
     }
 
     strStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       strStats->setHasNull(true);
     }
   }
@@ -1616,7 +1592,6 @@ namespace orc {
                           binBatch->notNull.data() + offset : nullptr;
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       uint64_t unsignedLength = static_cast<uint64_t>(length[i]);
       if (!notNull || notNull[i]) {
@@ -1624,13 +1599,11 @@ namespace orc {
 
         binStats->update(unsignedLength);
         ++count;
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     directLengthEncoder->add(length, numValues, notNull);
     binStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       binStats->setHasNull(true);
     }
   }
@@ -1732,7 +1705,6 @@ namespace orc {
     int64_t *nanos = tsBatch->nanoseconds.data() + offset;
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (notNull == nullptr || notNull[i]) {
         // TimestampVectorBatch already stores data in UTC
@@ -1746,12 +1718,10 @@ namespace orc {
 
         secs[i] -= timezone.getEpoch();
         nanos[i] = formatNano(nanos[i]);
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     tsStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       tsStats->setHasNull(true);
     }
 
@@ -1839,17 +1809,14 @@ namespace orc {
     rleEncoder->add(data, numValues, notNull);
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         ++count;
         dateStats->update(static_cast<int32_t>(data[i]));
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     dateStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       dateStats->setHasNull(true);
     }
   }
@@ -1932,7 +1899,6 @@ namespace orc {
     const int64_t* values = decBatch->values.data() + offset;
 
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         int64_t val = zigZag(values[i]);
@@ -1950,12 +1916,10 @@ namespace orc {
         valueStream->write(buffer, static_cast<size_t>(data - buffer));
         ++count;
         decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     decStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       decStats->setHasNull(true);
     }
     std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
@@ -2058,7 +2022,6 @@ namespace orc {
     // The current encoding of decimal columns stores the integer representation
     // of the value as an unbounded length zigzag encoded base 128 varint.
     uint64_t count = 0;
-    bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         Int128 val = zigZagInt128(values[i]);
@@ -2076,12 +2039,10 @@ namespace orc {
 
         ++count;
         decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
-      } else if (!hasNull) {
-        hasNull = true;
       }
     }
     decStats->increase(count);
-    if (hasNull) {
+    if (count < numValues) {
       decStats->setHasNull(true);
     }
     std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
@@ -2193,16 +2154,13 @@ namespace orc {
         colIndexStatistics->increase(numValues);
       } else {
         uint64_t count = 0;
-        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             ++count;
-          } else if (!hasNull) {
-            hasNull = true;
           }
         }
         colIndexStatistics->increase(count);
-        if (hasNull) {
+        if (count < numValues) {
           colIndexStatistics->setHasNull(true);
         }
       }
@@ -2418,16 +2376,13 @@ namespace orc {
         colIndexStatistics->increase(numValues);
       } else {
         uint64_t count = 0;
-        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             ++count;
-          } else if (!hasNull) {
-            hasNull = true;
           }
         }
         colIndexStatistics->increase(count);
-        if (hasNull) {
+        if (count < numValues) {
           colIndexStatistics->setHasNull(true);
         }
       }
@@ -2675,16 +2630,13 @@ namespace orc {
         colIndexStatistics->increase(numValues);
       } else {
         uint64_t count = 0;
-        bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             ++count;
-          } else if (!hasNull) {
-            hasNull = true;
           }
         }
         colIndexStatistics->increase(count);
-        if (hasNull) {
+        if (count < numValues) {
           colIndexStatistics->setHasNull(true);
         }
       }

Reply via email to