This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new 3f0ee8d  ORC-443: [C++] Code improvements in ColumnWriter.
3f0ee8d is described below

commit 3f0ee8d1084e413de0727208651e47dc3f11821c
Author: Fang Zheng <[email protected]>
AuthorDate: Mon Dec 3 10:40:35 2018 -0800

    ORC-443: [C++] Code improvements in ColumnWriter.
    
    Fixes #344
    
    Signed-off-by: Gang Wu <[email protected]>
---
 c++/src/ByteRLE.cc      |   8 +-
 c++/src/ColumnWriter.cc | 244 ++++++++++++++++++++++++++++--------------------
 2 files changed, 148 insertions(+), 104 deletions(-)

diff --git a/c++/src/ByteRLE.cc b/c++/src/ByteRLE.cc
index 2664019..4bf8b7a 100644
--- a/c++/src/ByteRLE.cc
+++ b/c++/src/ByteRLE.cc
@@ -40,7 +40,7 @@ namespace orc {
     virtual ~ByteRleEncoderImpl() override;
 
     /**
-     * Encode the next batch of values
+     * Encode the next batch of values.
      * @param data to be encoded
      * @param numValues the number of values to be encoded
      * @param notNull If the pointer is null, all values are read. If the
@@ -55,8 +55,8 @@ namespace orc {
     virtual uint64_t getBufferSize() const override;
 
     /**
-     * Flushing underlying BufferedOutputStream
-    */
+     * Flush underlying BufferedOutputStream.
+     */
     virtual uint64_t flush() override;
 
     virtual void recordPosition(PositionRecorder* recorder) const override;
@@ -122,7 +122,7 @@ namespace orc {
         writeByte(
             static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT)));
         writeByte(literals[0]);
-     } else {
+      } else {
         writeByte(static_cast<char>(-numLiterals));
         for (int i = 0; i < numLiterals; ++i) {
           writeByte(literals[i]);
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index c6c30f3..ef31c45 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -274,13 +274,13 @@ namespace orc {
                                ColumnVectorBatch& rowBatch,
                                uint64_t offset,
                                uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     const StructVectorBatch* structBatch =
       dynamic_cast<const StructVectorBatch *>(&rowBatch);
     if (structBatch == nullptr) {
       throw InvalidArgument("Failed to cast to StructVectorBatch");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
     for (uint32_t i = 0; i < children.size(); ++i) {
       children[i]->add(*structBatch->fields[i], offset, numValues);
     }
@@ -289,15 +289,17 @@ namespace orc {
     if (!structBatch->hasNulls) {
       colIndexStatistics->increase(numValues);
     } else {
+      uint64_t count = 0;
       bool hasNull = false;
       const char* notNull = structBatch->notNull.data() + offset;
       for (uint64_t i = 0; i < numValues; ++i) {
         if (notNull[i]) {
-          colIndexStatistics->increase(1);
+          ++count;
         } else if (!hasNull) {
           hasNull = true;
         }
       }
+      colIndexStatistics->increase(count);
       if (hasNull) {
         colIndexStatistics->setHasNull(true);
       }
@@ -445,13 +447,18 @@ namespace orc {
                                 ColumnVectorBatch& rowBatch,
                                 uint64_t offset,
                                 uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
-
     const LongVectorBatch* longBatch =
       dynamic_cast<const LongVectorBatch*>(&rowBatch);
     if (longBatch == nullptr) {
       throw InvalidArgument("Failed to cast to LongVectorBatch");
     }
+    IntegerColumnStatisticsImpl* intStats =
+        dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
+    if (intStats == nullptr) {
+      throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
+    }
+
+    ColumnWriter::add(rowBatch, offset, numValues);
 
     const int64_t* data = longBatch->data.data() + offset;
     const char* notNull = longBatch->hasNulls ?
@@ -460,21 +467,17 @@ namespace orc {
     rleEncoder->add(data, numValues, notNull);
 
     // update stats
-    IntegerColumnStatisticsImpl* intStats =
-      dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
-    if (intStats == nullptr) {
-      throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
-    }
-
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (notNull == nullptr || notNull[i]) {
-        intStats->increase(1);
+        ++count;
         intStats->update(data[i], 1);
       } else if (!hasNull) {
         hasNull = true;
       }
     }
+    intStats->increase(count);
     if (hasNull) {
       intStats->setHasNull(true);
     }
@@ -549,12 +552,17 @@ namespace orc {
   void ByteColumnWriter::add(ColumnVectorBatch& rowBatch,
                              uint64_t offset,
                              uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
-
     LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
     if (byteBatch == nullptr) {
       throw InvalidArgument("Failed to cast to LongVectorBatch");
     }
+    IntegerColumnStatisticsImpl* intStats =
+        dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
+    if (intStats == nullptr) {
+      throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
+    }
+
+    ColumnWriter::add(rowBatch, offset, numValues);
 
     int64_t* data = byteBatch->data.data() + offset;
     const char* notNull = byteBatch->hasNulls ?
@@ -566,20 +574,17 @@ namespace orc {
     }
     byteRleEncoder->add(byteData, numValues, notNull);
 
-    IntegerColumnStatisticsImpl* intStats =
-        dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
-    if (intStats == nullptr) {
-      throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
-    }
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (notNull == nullptr || notNull[i]) {
-        intStats->increase(1);
+        ++count;
         intStats->update(static_cast<int64_t>(byteData[i]), 1);
       } else if (!hasNull) {
         hasNull = true;
       }
     }
+    intStats->increase(count);
     if (hasNull) {
       intStats->setHasNull(true);
     }
@@ -654,12 +659,18 @@ namespace orc {
   void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch,
                                 uint64_t offset,
                                 uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
-
     LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
     if (byteBatch == nullptr) {
       throw InvalidArgument("Failed to cast to LongVectorBatch");
     }
+    BooleanColumnStatisticsImpl* boolStats =
+        dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
+    if (boolStats == nullptr) {
+      throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl");
+    }
+
+    ColumnWriter::add(rowBatch, offset, numValues);
+
     int64_t* data = byteBatch->data.data() + offset;
     const char* notNull = byteBatch->hasNulls ?
                           byteBatch->notNull.data() + offset : nullptr;
@@ -670,20 +681,17 @@ namespace orc {
     }
     rleEncoder->add(byteData, numValues, notNull);
 
-    BooleanColumnStatisticsImpl* boolStats =
-        dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
-    if (boolStats == nullptr) {
-      throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl");
-    }
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (notNull == nullptr || notNull[i]) {
-        boolStats->increase(1);
+        ++count;
         boolStats->update(byteData[i] != 0, 1);
       } else if (!hasNull) {
         hasNull = true;
       }
     }
+    boolStats->increase(count);
     if (hasNull) {
       boolStats->setHasNull(true);
     }
@@ -774,25 +782,26 @@ namespace orc {
   void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch,
                                uint64_t offset,
                                uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     const DoubleVectorBatch* dblBatch =
       dynamic_cast<const DoubleVectorBatch*>(&rowBatch);
     if (dblBatch == nullptr) {
       throw InvalidArgument("Failed to cast to DoubleVectorBatch");
     }
-
-    const double* doubleData = dblBatch->data.data() + offset;
-    const char* notNull = dblBatch->hasNulls ?
-                          dblBatch->notNull.data() + offset : nullptr;
-
     DoubleColumnStatisticsImpl* doubleStats =
       dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
     if (doubleStats == nullptr) {
       throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
+    const double* doubleData = dblBatch->data.data() + offset;
+    const char* notNull = dblBatch->hasNulls ?
+                          dblBatch->notNull.data() + offset : nullptr;
+
     size_t bytes = isFloat ? 4 : 8;
     char* data = buffer.data();
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
@@ -802,13 +811,13 @@ namespace orc {
           encodeFloatNum<double, int64_t>(doubleData[i], data);
         }
         dataStream->write(data, bytes);
-
-        doubleStats->increase(1);
+        ++count;
         doubleStats->update(doubleData[i]);
       } else if (!hasNull) {
         hasNull = true;
       }
     }
+    doubleStats->increase(count);
     if (hasNull) {
       doubleStats->setHasNull(true);
     }
@@ -1071,13 +1080,20 @@ namespace orc {
   void StringColumnWriter::add(ColumnVectorBatch& rowBatch,
                                uint64_t offset,
                                uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     const StringVectorBatch* stringBatch =
       dynamic_cast<const StringVectorBatch*>(&rowBatch);
     if (stringBatch == nullptr) {
       throw InvalidArgument("Failed to cast to StringVectorBatch");
     }
 
+    StringColumnStatisticsImpl* strStats =
+        dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
+    if (strStats == nullptr) {
+      throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
+    }
+
+    ColumnWriter::add(rowBatch, offset, numValues);
+
     char *const * data = stringBatch->data.data() + offset;
     const int64_t* length = stringBatch->length.data() + offset;
     const char* notNull = stringBatch->hasNulls ?
@@ -1087,26 +1103,24 @@ namespace orc {
       directLengthEncoder->add(length, numValues, notNull);
     }
 
-    StringColumnStatisticsImpl* strStats =
-      dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
-    if (strStats == nullptr) {
-      throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
-    }
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
+        const size_t len = static_cast<size_t>(length[i]);
         if (useDictionary) {
-          size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i]));
+          size_t index = dictionary.insert(data[i], len);
           dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
         } else {
-          directDataStream->write(data[i], static_cast<size_t>(length[i]));
+          directDataStream->write(data[i], len);
         }
-        strStats->update(data[i], static_cast<size_t>(length[i]));
-        strStats->increase(1);
+        strStats->update(data[i], len);
+        ++count;
       } else if (!hasNull) {
         hasNull = true;
       }
     }
+    strStats->increase(count);
     if (hasNull) {
       strStats->setHasNull(true);
     }
@@ -1432,23 +1446,25 @@ namespace orc {
   void CharColumnWriter::add(ColumnVectorBatch& rowBatch,
                              uint64_t offset,
                              uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
     if (charsBatch == nullptr) {
       throw InvalidArgument("Failed to cast to StringVectorBatch");
     }
 
-    char** data = charsBatch->data.data() + offset;
-    int64_t* length = charsBatch->length.data() + offset;
-    const char* notNull = charsBatch->hasNulls ?
-                          charsBatch->notNull.data() + offset : nullptr;
-
     StringColumnStatisticsImpl* strStats =
         dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
     if (strStats == nullptr) {
       throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
+    char** data = charsBatch->data.data() + offset;
+    int64_t* length = charsBatch->length.data() + offset;
+    const char* notNull = charsBatch->hasNulls ?
+                          charsBatch->notNull.data() + offset : nullptr;
+
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
@@ -1477,7 +1493,7 @@ namespace orc {
         }
 
         strStats->update(charData, static_cast<size_t>(length[i]));
-        strStats->increase(1);
+        ++count;
       } else if (!hasNull) {
         hasNull = true;
       }
@@ -1487,6 +1503,7 @@ namespace orc {
       directLengthEncoder->add(length, numValues, notNull);
     }
 
+    strStats->increase(count);
     if (hasNull) {
       strStats->setHasNull(true);
     }
@@ -1513,23 +1530,25 @@ namespace orc {
   void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch,
                                 uint64_t offset,
                                 uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
     if (charsBatch == nullptr) {
       throw InvalidArgument("Failed to cast to StringVectorBatch");
     }
 
-    char* const* data = charsBatch->data.data() + offset;
-    int64_t* length = charsBatch->length.data() + offset;
-    const char* notNull = charsBatch->hasNulls ?
-                          charsBatch->notNull.data() + offset : nullptr;
-
     StringColumnStatisticsImpl* strStats =
         dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
     if (strStats == nullptr) {
       throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
+    char* const* data = charsBatch->data.data() + offset;
+    int64_t* length = charsBatch->length.data() + offset;
+    const char* notNull = charsBatch->hasNulls ?
+                          charsBatch->notNull.data() + offset : nullptr;
+
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
@@ -1545,7 +1564,7 @@ namespace orc {
         }
 
         strStats->update(data[i], static_cast<size_t>(length[i]));
-        strStats->increase(1);
+        ++count;
       } else if (!hasNull) {
         hasNull = true;
       }
@@ -1555,6 +1574,7 @@ namespace orc {
       directLengthEncoder->add(length, numValues, notNull);
     }
 
+    strStats->increase(count);
     if (hasNull) {
       strStats->setHasNull(true);
     }
@@ -1577,16 +1597,10 @@ namespace orc {
   void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch,
                                uint64_t offset,
                                uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
-
     StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
     if (binBatch == nullptr) {
       throw InvalidArgument("Failed to cast to StringVectorBatch");
     }
-    char** data = binBatch->data.data() + offset;
-    int64_t* length = binBatch->length.data() + offset;
-    const char* notNull = binBatch->hasNulls ?
-                          binBatch->notNull.data() + offset : nullptr;
 
     BinaryColumnStatisticsImpl* binStats =
         dynamic_cast<BinaryColumnStatisticsImpl*>(colIndexStatistics.get());
@@ -1594,6 +1608,14 @@ namespace orc {
       throw InvalidArgument("Failed to cast to BinaryColumnStatisticsImpl");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
+    char** data = binBatch->data.data() + offset;
+    int64_t* length = binBatch->length.data() + offset;
+    const char* notNull = binBatch->hasNulls ?
+                          binBatch->notNull.data() + offset : nullptr;
+
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       uint64_t unsignedLength = static_cast<uint64_t>(length[i]);
@@ -1601,12 +1623,13 @@ namespace orc {
         directDataStream->write(data[i], unsignedLength);
 
         binStats->update(unsignedLength);
-        binStats->increase(1);
+        ++count;
       } else if (!hasNull) {
         hasNull = true;
       }
     }
     directLengthEncoder->add(length, numValues, notNull);
+    binStats->increase(count);
     if (hasNull) {
       binStats->setHasNull(true);
     }
@@ -1689,29 +1712,32 @@ namespace orc {
   void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch,
                                   uint64_t offset,
                                   uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     TimestampVectorBatch* tsBatch =
       dynamic_cast<TimestampVectorBatch*>(&rowBatch);
     if (tsBatch == nullptr) {
       throw InvalidArgument("Failed to cast to TimestampVectorBatch");
     }
 
-    const char* notNull = tsBatch->hasNulls ?
-                          tsBatch->notNull.data() + offset : nullptr;
-    int64_t *secs = tsBatch->data.data() + offset;
-    int64_t *nanos = tsBatch->nanoseconds.data() + offset;
-
     TimestampColumnStatisticsImpl* tsStats =
         dynamic_cast<TimestampColumnStatisticsImpl*>(colIndexStatistics.get());
     if (tsStats == nullptr) {
       throw InvalidArgument("Failed to cast to TimestampColumnStatisticsImpl");
     }
+
+    ColumnWriter::add(rowBatch, offset, numValues);
+
+    const char* notNull = tsBatch->hasNulls ?
+                          tsBatch->notNull.data() + offset : nullptr;
+    int64_t *secs = tsBatch->data.data() + offset;
+    int64_t *nanos = tsBatch->nanoseconds.data() + offset;
+
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (notNull == nullptr || notNull[i]) {
         // TimestampVectorBatch already stores data in UTC
         int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
-        tsStats->increase(1);
+        ++count;
         tsStats->update(millsUTC);
 
         if (secs[i] < 0 && nanos[i] != 0) {
@@ -1724,6 +1750,7 @@ namespace orc {
         hasNull = true;
       }
     }
+    tsStats->increase(count);
     if (hasNull) {
       tsStats->setHasNull(true);
     }
@@ -1791,33 +1818,37 @@ namespace orc {
   void DateColumnWriter::add(ColumnVectorBatch& rowBatch,
                              uint64_t offset,
                              uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     const LongVectorBatch* longBatch =
       dynamic_cast<const LongVectorBatch*>(&rowBatch);
     if (longBatch == nullptr) {
       throw InvalidArgument("Failed to cast to LongVectorBatch");
     }
 
+    DateColumnStatisticsImpl* dateStats =
+        dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get());
+    if (dateStats == nullptr) {
+      throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl");
+    }
+
+    ColumnWriter::add(rowBatch, offset, numValues);
+
     const int64_t* data = longBatch->data.data() + offset;
     const char* notNull = longBatch->hasNulls ?
                           longBatch->notNull.data() + offset : nullptr;
 
     rleEncoder->add(data, numValues, notNull);
 
-    DateColumnStatisticsImpl* dateStats =
-      dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get());
-    if (dateStats == nullptr) {
-      throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl");
-    }
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
-        dateStats->increase(1);
+        ++count;
         dateStats->update(static_cast<int32_t>(data[i]));
       } else if (!hasNull) {
         hasNull = true;
       }
     }
+    dateStats->increase(count);
     if (hasNull) {
       dateStats->setHasNull(true);
     }
@@ -1882,22 +1913,25 @@ namespace orc {
   void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch,
                                   uint64_t offset,
                                   uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     const Decimal64VectorBatch* decBatch =
       dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
     if (decBatch == nullptr) {
       throw InvalidArgument("Failed to cast to Decimal64VectorBatch");
     }
 
-    const char* notNull = decBatch->hasNulls ?
-                          decBatch->notNull.data() + offset : nullptr;
-    const int64_t* values = decBatch->values.data() + offset;
     DecimalColumnStatisticsImpl* decStats =
       dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
     if (decStats == nullptr) {
       throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
+    const char* notNull = decBatch->hasNulls ?
+                          decBatch->notNull.data() + offset : nullptr;
+    const int64_t* values = decBatch->values.data() + offset;
+
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
@@ -1914,13 +1948,13 @@ namespace orc {
           }
         }
         valueStream->write(buffer, static_cast<size_t>(data - buffer));
-
-        decStats->increase(1);
+        ++count;
         decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
       } else if (!hasNull) {
         hasNull = true;
       }
     }
+    decStats->increase(count);
     if (hasNull) {
       decStats->setHasNull(true);
     }
@@ -2003,24 +2037,27 @@ namespace orc {
   void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch,
                                    uint64_t offset,
                                    uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
     const Decimal128VectorBatch* decBatch =
       dynamic_cast<const Decimal128VectorBatch*>(&rowBatch);
     if (decBatch == nullptr) {
       throw InvalidArgument("Failed to cast to Decimal128VectorBatch");
     }
 
-    const char* notNull = decBatch->hasNulls ?
-                          decBatch->notNull.data() + offset : nullptr;
-    const Int128* values = decBatch->values.data() + offset;
     DecimalColumnStatisticsImpl* decStats =
       dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
     if (decStats == nullptr) {
       throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
+    const char* notNull = decBatch->hasNulls ?
+                          decBatch->notNull.data() + offset : nullptr;
+    const Int128* values = decBatch->values.data() + offset;
+
     // The current encoding of decimal columns stores the integer representation
     // of the value as an unbounded length zigzag encoded base 128 varint.
+    uint64_t count = 0;
     bool hasNull = false;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
@@ -2037,12 +2074,13 @@ namespace orc {
         }
         valueStream->write(buffer, static_cast<size_t>(data - buffer));
 
-        decStats->increase(1);
+        ++count;
         decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
       } else if (!hasNull) {
         hasNull = true;
       }
     }
+    decStats->increase(count);
     if (hasNull) {
       decStats->setHasNull(true);
     }
@@ -2125,13 +2163,13 @@ namespace orc {
   void ListColumnWriter::add(ColumnVectorBatch& rowBatch,
                              uint64_t offset,
                              uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
-
     ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch);
     if (listBatch == nullptr) {
       throw InvalidArgument("Failed to cast to ListVectorBatch");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
     int64_t* offsets = listBatch->offsets.data() + offset;
     const char* notNull = listBatch->hasNulls ?
                           listBatch->notNull.data() + offset : nullptr;
@@ -2154,14 +2192,16 @@ namespace orc {
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        uint64_t count = 0;
         bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
-            colIndexStatistics->increase(1);
+            ++count;
           } else if (!hasNull) {
             hasNull = true;
           }
         }
+        colIndexStatistics->increase(count);
         if (hasNull) {
           colIndexStatistics->setHasNull(true);
         }
@@ -2344,13 +2384,13 @@ namespace orc {
   void MapColumnWriter::add(ColumnVectorBatch& rowBatch,
                             uint64_t offset,
                             uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
-
     MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch);
     if (mapBatch == nullptr) {
       throw InvalidArgument("Failed to cast to MapVectorBatch");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
     int64_t* offsets = mapBatch->offsets.data() + offset;
     const char* notNull = mapBatch->hasNulls ?
                           mapBatch->notNull.data() + offset : nullptr;
@@ -2377,14 +2417,16 @@ namespace orc {
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        uint64_t count = 0;
         bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
-            colIndexStatistics->increase(1);
+            ++count;
           } else if (!hasNull) {
             hasNull = true;
           }
         }
+        colIndexStatistics->increase(count);
         if (hasNull) {
           colIndexStatistics->setHasNull(true);
         }
@@ -2595,13 +2637,13 @@ namespace orc {
   void UnionColumnWriter::add(ColumnVectorBatch& rowBatch,
                               uint64_t offset,
                               uint64_t numValues) {
-    ColumnWriter::add(rowBatch, offset, numValues);
-
     UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch);
     if (unionBatch == nullptr) {
       throw InvalidArgument("Failed to cast to UnionVectorBatch");
     }
 
+    ColumnWriter::add(rowBatch, offset, numValues);
+
     const char* notNull = unionBatch->hasNulls ?
                           unionBatch->notNull.data() + offset : nullptr;
     unsigned char * tags = unionBatch->tags.data() + offset;
@@ -2632,14 +2674,16 @@ namespace orc {
       if (!notNull) {
         colIndexStatistics->increase(numValues);
       } else {
+        uint64_t count = 0;
         bool hasNull = false;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
-            colIndexStatistics->increase(1);
+            ++count;
           } else if (!hasNull) {
             hasNull = true;
           }
         }
+        colIndexStatistics->increase(count);
         if (hasNull) {
           colIndexStatistics->setHasNull(true);
         }

Reply via email to