This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new ecff9f1  ORC-824: [C++] Add column statistics for List and Map type
ecff9f1 is described below

commit ecff9f15241d5c35abd1ff641940ac6be3b6e3f5
Author: coderex2522 <[email protected]>
AuthorDate: Thu Jul 1 12:33:59 2021 +0800

    ORC-824: [C++] Add column statistics for List and Map type
    
    This closes #729
---
 c++/include/orc/Statistics.hh    |  68 ++++++++++++++++
 c++/src/ColumnWriter.cc          |  24 ++++--
 c++/src/Statistics.cc            |  34 +++++++-
 c++/src/Statistics.hh            | 165 +++++++++++++++++++++++++++++++++++++++
 c++/test/TestColumnStatistics.cc |  54 +++++++++++++
 5 files changed, 338 insertions(+), 7 deletions(-)

diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh
index c7b0781..4d7caea 100644
--- a/c++/include/orc/Statistics.hh
+++ b/c++/include/orc/Statistics.hh
@@ -384,6 +384,74 @@ namespace orc {
     virtual uint32_t getNumberOfColumns() const = 0;
   };
 
+  /**
+   * Statistics for collection types such as Map and List.
+   */
+  class CollectionColumnStatistics : public ColumnStatistics {
+  public:
+    virtual ~CollectionColumnStatistics();
+
+    /**
+     * Check whether the column has a minimum number of children.
+     * @return true if the minimum children count is defined
+     */
+    virtual bool hasMinimumChildren() const = 0;
+
+    /**
+     * Check whether the column has a maximum number of children.
+     * @return true if the maximum children count is defined
+     */
+    virtual bool hasMaximumChildren() const = 0;
+
+    /**
+     * Check whether the column has a total number of children.
+     * @return true if the total children count is defined
+     */
+    virtual bool hasTotalChildren() const = 0;
+
+    /**
+     * set hasTotalChildren value
+     * @param newHasTotalChildren hasTotalChildren value
+     */
+    virtual void setHasTotalChildren(bool newHasTotalChildren) = 0;
+
+    /**
+     * Get minimum number of children in the collection.
+     * @return the minimum children count
+     */
+    virtual uint64_t getMinimumChildren() const = 0;
+
+    /**
+     * set new minimum children count
+     * @param min new minimum children count
+     */
+    virtual void setMinimumChildren(uint64_t min) = 0;
+
+    /**
+     * Get maximum number of children in the collection.
+     * @return the maximum children count
+     */
+    virtual uint64_t getMaximumChildren() const = 0;
+
+    /**
+     * set new maximum children count
+     * @param max new maximum children count
+     */
+    virtual void setMaximumChildren(uint64_t max) = 0;
+
+    /**
+     * Get the total number of children in the collection.
+     * @return the total number of children
+     */
+    virtual uint64_t getTotalChildren() const = 0;
+
+    /**
+     * set new total children count
+     * @param newTotalChildrenCount total children count to be set
+     */
+    virtual void setTotalChildren(uint64_t newTotalChildrenCount) = 0;
+  };
+
   class StripeStatistics : public Statistics {
   public:
     virtual ~StripeStatistics();
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index a259594..fd77d70 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -2262,6 +2262,11 @@ namespace orc {
     if (listBatch == nullptr) {
       throw InvalidArgument("Failed to cast to ListVectorBatch");
     }
+    CollectionColumnStatisticsImpl* collectionStats =
+        dynamic_cast<CollectionColumnStatisticsImpl*>(colIndexStatistics.get());
+    if (collectionStats == nullptr) {
+      throw InvalidArgument("Failed to cast to CollectionColumnStatisticsImpl");
+    }
 
     ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
 
@@ -2285,20 +2290,21 @@ namespace orc {
 
     if (enableIndex) {
       if (!notNull) {
-        colIndexStatistics->increase(numValues);
+        collectionStats->increase(numValues);
       } else {
         uint64_t count = 0;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             ++count;
+            collectionStats->update(static_cast<uint64_t>(offsets[i]));
             if (enableBloomFilter) {
               bloomFilter->addLong(offsets[i]);
             }
           }
         }
-        colIndexStatistics->increase(count);
+        collectionStats->increase(count);
         if (count < numValues) {
-          colIndexStatistics->setHasNull(true);
+          collectionStats->setHasNull(true);
         }
       }
     }
@@ -2488,6 +2494,11 @@ namespace orc {
     if (mapBatch == nullptr) {
       throw InvalidArgument("Failed to cast to MapVectorBatch");
     }
+    CollectionColumnStatisticsImpl* collectionStats =
+        dynamic_cast<CollectionColumnStatisticsImpl*>(colIndexStatistics.get());
+    if (collectionStats == nullptr) {
+      throw InvalidArgument("Failed to cast to CollectionColumnStatisticsImpl");
+    }
 
     ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
 
@@ -2515,20 +2526,21 @@ namespace orc {
 
     if (enableIndex) {
       if (!notNull) {
-        colIndexStatistics->increase(numValues);
+        collectionStats->increase(numValues);
       } else {
         uint64_t count = 0;
         for (uint64_t i = 0; i < numValues; ++i) {
           if (notNull[i]) {
             ++count;
+            collectionStats->update(static_cast<uint64_t>(offsets[i]));
             if (enableBloomFilter) {
               bloomFilter->addLong(offsets[i]);
             }
           }
         }
-        colIndexStatistics->increase(count);
+        collectionStats->increase(count);
         if (count < numValues) {
-          colIndexStatistics->setHasNull(true);
+          collectionStats->setHasNull(true);
         }
       }
     }
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index b0c9de8..53e0da2 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -30,6 +30,8 @@ namespace orc {
       return new IntegerColumnStatisticsImpl(s);
     } else if (s.has_doublestatistics()) {
       return new DoubleColumnStatisticsImpl(s);
+    } else if (s.has_collectionstatistics()) {
+      return new CollectionColumnStatisticsImpl(s);
     } else if (s.has_stringstatistics()) {
       return new StringColumnStatisticsImpl(s, statContext);
     } else if (s.has_bucketstatistics()) {
@@ -135,6 +137,10 @@ namespace orc {
     // PASS
   }
 
+  CollectionColumnStatistics::~CollectionColumnStatistics() {
+    // PASS
+  }
+
   MutableColumnStatistics::~MutableColumnStatistics() {
     // PASS
   }
@@ -167,6 +173,10 @@ namespace orc {
     // PASS
   }
 
+  CollectionColumnStatisticsImpl::~CollectionColumnStatisticsImpl() {
+    // PASS
+  }
+
   void IntegerColumnStatisticsImpl::update(int64_t value, int repetitions) {
     _stats.updateMinMax(value);
 
@@ -381,6 +391,26 @@ namespace orc {
     }
   }
 
+  CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl
+  (const proto::ColumnStatistics& pb) {
+    _stats.setNumberOfValues(pb.numberofvalues());
+    _stats.setHasNull(pb.hasnull());
+    if (!pb.has_collectionstatistics()) {
+      _stats.setMinimum(0);
+      _stats.setMaximum(0);
+      _stats.setSum(0);
+    } else {
+      const proto::CollectionStatistics& stats = pb.collectionstatistics();
+      _stats.setHasMinimum(stats.has_minchildren());
+      _stats.setHasMaximum(stats.has_maxchildren());
+      _stats.setHasSum(stats.has_totalchildren());
+
+      _stats.setMinimum(stats.minchildren());
+      _stats.setMaximum(stats.maxchildren());
+      _stats.setSum(stats.totalchildren());
+    }
+  }
+
   std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
     const Type& type) {
     switch (static_cast<int64_t>(type.getKind())) {
@@ -393,9 +423,11 @@ namespace orc {
       case SHORT:
         return std::unique_ptr<MutableColumnStatistics>(
           new IntegerColumnStatisticsImpl());
-      case STRUCT:
       case MAP:
       case LIST:
+        return std::unique_ptr<MutableColumnStatistics>(
+          new CollectionColumnStatisticsImpl());
+      case STRUCT:
       case UNION:
         return std::unique_ptr<MutableColumnStatistics>(
           new ColumnStatisticsImpl());
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index 434a0ad..f056aba 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -173,6 +173,7 @@ namespace orc {
   typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
   typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics;
   typedef InternalStatisticsImpl<std::string> InternalStringStatistics;
+  typedef InternalStatisticsImpl<uint64_t> InternalCollectionStatistics;
 
   /**
    * Mutable column statistics for use by the writer.
@@ -1463,6 +1464,170 @@ namespace orc {
     }
   };
 
+  class CollectionColumnStatisticsImpl : public CollectionColumnStatistics,
+                                         public MutableColumnStatistics {
+  private:
+    InternalCollectionStatistics _stats;
+
+  public:
+    CollectionColumnStatisticsImpl() { reset(); }
+    CollectionColumnStatisticsImpl(const proto::ColumnStatistics &stats);
+    virtual ~CollectionColumnStatisticsImpl() override;
+
+    bool hasMinimumChildren() const override {
+      return _stats.hasMinimum();
+    }
+
+    bool hasMaximumChildren() const override {
+      return _stats.hasMaximum();
+    }
+
+    bool hasTotalChildren() const override {
+      return _stats.hasSum();
+    }
+
+    void increase(uint64_t count) override {
+      _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+    }
+
+    uint64_t getNumberOfValues() const override {
+      return _stats.getNumberOfValues();
+    }
+
+    void setNumberOfValues(uint64_t value) override {
+      _stats.setNumberOfValues(value);
+    }
+
+    bool hasNull() const override {
+      return _stats.hasNull();
+    }
+
+    void setHasNull(bool hasNull) override {
+      _stats.setHasNull(hasNull);
+    }
+
+    uint64_t getMinimumChildren() const override {
+      if(hasMinimumChildren()) {
+        return _stats.getMinimum();
+      } else {
+        throw ParseError("MinimumChildren is not defined.");
+      }
+    }
+
+    uint64_t getMaximumChildren() const override {
+      if(hasMaximumChildren()) {
+        return _stats.getMaximum();
+      } else {
+        throw ParseError("MaximumChildren is not defined.");
+      }
+    }
+
+    uint64_t getTotalChildren() const override {
+      if(hasTotalChildren()) {
+        return _stats.getSum();
+      } else {
+        throw ParseError("TotalChildren is not defined.");
+      }
+    }
+
+    void setMinimumChildren(uint64_t minimum) override {
+      _stats.setHasMinimum(true);
+      _stats.setMinimum(minimum);
+    }
+
+    void setMaximumChildren(uint64_t maximum) override {
+      _stats.setHasMaximum(true);
+      _stats.setMaximum(maximum);
+    }
+
+    void setTotalChildren(uint64_t sum) override {
+      _stats.setHasSum(true);
+      _stats.setSum(sum);
+    }
+
+    void setHasTotalChildren(bool hasSum) override {
+      _stats.setHasSum(hasSum);
+    }
+
+    void merge(const MutableColumnStatistics& other) override {
+      const CollectionColumnStatisticsImpl& collectionStats =
+          dynamic_cast<const CollectionColumnStatisticsImpl&>(other);
+
+      _stats.merge(collectionStats._stats);
+
+      // hasSumValue here means no overflow
+      _stats.setHasSum(_stats.hasSum() && collectionStats.hasTotalChildren());
+      if (_stats.hasSum()) {
+        uint64_t oldSum = _stats.getSum();
+        _stats.setSum(_stats.getSum() + collectionStats.getTotalChildren());
+        if (oldSum > _stats.getSum()) {
+          _stats.setHasSum(false);
+        }
+      }
+    }
+
+    void reset() override {
+      _stats.reset();
+      setTotalChildren(0);
+    }
+
+    void update(uint64_t value) {
+      _stats.updateMinMax(value);
+      if (_stats.hasSum()) {
+        uint64_t oldSum = _stats.getSum();
+        _stats.setSum(_stats.getSum() + value);
+        if (oldSum > _stats.getSum()) {
+          _stats.setHasSum(false);
+        }
+      }
+    }
+
+    void toProtoBuf(proto::ColumnStatistics &pbStats) const override {
+      pbStats.set_hasnull(_stats.hasNull());
+      pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+      proto::CollectionStatistics* collectionStats =
+          pbStats.mutable_collectionstatistics();
+      if (_stats.hasMinimum()) {
+        collectionStats->set_minchildren(_stats.getMinimum());
+        collectionStats->set_maxchildren(_stats.getMaximum());
+      } else {
+        collectionStats->clear_minchildren();
+        collectionStats->clear_maxchildren();
+      }
+      if (_stats.hasSum()) {
+        collectionStats->set_totalchildren(_stats.getSum());
+      } else {
+        collectionStats->clear_totalchildren();
+      }
+    }
+
+    std::string toString() const override {
+      std::ostringstream buffer;
+      buffer << "Data type: Collection(LIST|MAP)" << std::endl
+            << "Values: " << getNumberOfValues() << std::endl
+            << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+      if (hasMinimumChildren()) {
+        buffer << "MinChildren: " << getMinimumChildren() << std::endl;
+      } else {
+        buffer << "MinChildren is not defined" << std::endl;
+      }
+
+      if (hasMaximumChildren()) {
+        buffer << "MaxChildren: " << getMaximumChildren() << std::endl;
+      } else {
+        buffer << "MaxChildren is not defined" << std::endl;
+      }
+
+      if (hasTotalChildren()) {
+        buffer << "TotalChildren: " << getTotalChildren() << std::endl;
+      } else {
+        buffer << "TotalChildren is not defined" << std::endl;
+      }
+      return buffer.str();
+    }
+  };
+
   ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
                                             const StatContext& statContext);
 
diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc
index 4d187b9..77f3e2d 100644
--- a/c++/test/TestColumnStatistics.cc
+++ b/c++/test/TestColumnStatistics.cc
@@ -490,4 +490,58 @@ namespace orc {
     EXPECT_EQ(9999, tsStatsFromPb->getMaximumNanos());
   }
 
+  TEST(ColumnStatistics, collectionColumnStatistics) {
+    std::unique_ptr<CollectionColumnStatisticsImpl> collectionStats(
+      new CollectionColumnStatisticsImpl());
+
+    // initial state
+    EXPECT_EQ(0, collectionStats->getNumberOfValues());
+    EXPECT_FALSE(collectionStats->hasNull());
+    EXPECT_FALSE(collectionStats->hasMinimumChildren());
+    EXPECT_FALSE(collectionStats->hasMaximumChildren());
+    EXPECT_TRUE(collectionStats->hasTotalChildren());
+    EXPECT_EQ(0, collectionStats->getTotalChildren());
+
+    // normal operations
+    collectionStats->increase(1);
+    EXPECT_EQ(1, collectionStats->getNumberOfValues());
+
+    collectionStats->increase(0);
+    EXPECT_EQ(1, collectionStats->getNumberOfValues());
+
+    collectionStats->increase(9999999999999999l);
+    EXPECT_EQ(10000000000000000l, collectionStats->getNumberOfValues());
+
+    collectionStats->update(10);
+    EXPECT_EQ(10, collectionStats->getMaximumChildren());
+    EXPECT_EQ(10, collectionStats->getMinimumChildren());
+
+    collectionStats->update(20);
+    EXPECT_EQ(20, collectionStats->getMaximumChildren());
+    EXPECT_EQ(10, collectionStats->getMinimumChildren());
+
+    EXPECT_EQ(30, collectionStats->getTotalChildren());
+    // test merge
+    std::unique_ptr<CollectionColumnStatisticsImpl> other(
+      new CollectionColumnStatisticsImpl());
+
+    other->update(40);
+    other->update(30);
+
+    collectionStats->merge(*other);
+    EXPECT_EQ(40, other->getMaximumChildren());
+    EXPECT_EQ(30, other->getMinimumChildren());
+    EXPECT_EQ(40, collectionStats->getMaximumChildren());
+    EXPECT_EQ(10, collectionStats->getMinimumChildren());
+    EXPECT_EQ(100, collectionStats->getTotalChildren());
+
+    // test overflow
+    other->update(std::numeric_limits<uint64_t>::max());
+    EXPECT_FALSE(other->hasTotalChildren());
+    // test merge overflow
+    other->setTotalChildren(std::numeric_limits<uint64_t>::max() - 50);
+    EXPECT_EQ(std::numeric_limits<uint64_t>::max() - 50, other->getTotalChildren());
+    collectionStats->merge(*other);
+    EXPECT_FALSE(collectionStats->hasTotalChildren());
+  }
 }

Reply via email to