This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new ecff9f1 ORC-824: [C++] Add column statistics for List and Map type
ecff9f1 is described below
commit ecff9f15241d5c35abd1ff641940ac6be3b6e3f5
Author: coderex2522 <[email protected]>
AuthorDate: Thu Jul 1 12:33:59 2021 +0800
ORC-824: [C++] Add column statistics for List and Map type
This closes #729
---
c++/include/orc/Statistics.hh | 68 ++++++++++++++++
c++/src/ColumnWriter.cc | 24 ++++--
c++/src/Statistics.cc | 34 +++++++-
c++/src/Statistics.hh | 165 +++++++++++++++++++++++++++++++++++++++
c++/test/TestColumnStatistics.cc | 54 +++++++++++++
5 files changed, 338 insertions(+), 7 deletions(-)
diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh
index c7b0781..4d7caea 100644
--- a/c++/include/orc/Statistics.hh
+++ b/c++/include/orc/Statistics.hh
@@ -384,6 +384,74 @@ namespace orc {
virtual uint32_t getNumberOfColumns() const = 0;
};
+ /**
+ * Statistics for collection columns such as Map and List.
+ *
+ * Exposes the minimum, maximum and total number of children recorded for
+ * the column. Each of the three values may be undefined; check the matching
+ * has*() accessor before calling the getter.
+ */
+ class CollectionColumnStatistics : public ColumnStatistics {
+ public:
+ virtual ~CollectionColumnStatistics();
+
+ /**
+ * Check whether a minimum number of children is defined for the column.
+ * @return true if getMinimumChildren() has a value to return
+ */
+ virtual bool hasMinimumChildren() const = 0;
+
+ /**
+ * Check whether a maximum number of children is defined for the column.
+ * @return true if getMaximumChildren() has a value to return
+ */
+ virtual bool hasMaximumChildren() const = 0;
+
+ /**
+ * Check whether a total number of children is defined for the column.
+ * @return true if getTotalChildren() has a value to return
+ */
+ virtual bool hasTotalChildren() const = 0;
+
+ /**
+ * Mark the total children count as defined or undefined without changing
+ * the stored total.
+ * @param newHasTotalChildren true if the total should be reported as defined
+ */
+ virtual void setHasTotalChildren(bool newHasTotalChildren) = 0;
+
+ /**
+ * Get the minimum number of children in the collection.
+ * Call hasMinimumChildren() first; CollectionColumnStatisticsImpl throws
+ * ParseError when the minimum is not defined.
+ * @return the minimum children count
+ */
+ virtual uint64_t getMinimumChildren() const = 0;
+
+ /**
+ * Set a new minimum children count.
+ * @param min new minimum children count
+ */
+ virtual void setMinimumChildren(uint64_t min) = 0;
+
+ /**
+ * Get the maximum number of children in the collection.
+ * Call hasMaximumChildren() first; CollectionColumnStatisticsImpl throws
+ * ParseError when the maximum is not defined.
+ * @return the maximum children count
+ */
+ virtual uint64_t getMaximumChildren() const = 0;
+
+ /**
+ * Set a new maximum children count.
+ * @param max new maximum children count
+ */
+ virtual void setMaximumChildren(uint64_t max) = 0;
+
+ /**
+ * Get the total number of children in the collection.
+ * Call hasTotalChildren() first; CollectionColumnStatisticsImpl throws
+ * ParseError when the total is not defined.
+ * @return the total number of children
+ */
+ virtual uint64_t getTotalChildren() const = 0;
+
+ /**
+ * Set a new total children count.
+ * @param newTotalChildrenCount total children count to be set
+ */
+ virtual void setTotalChildren(uint64_t newTotalChildrenCount) = 0;
+ };
+
class StripeStatistics : public Statistics {
public:
virtual ~StripeStatistics();
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index a259594..fd77d70 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -2262,6 +2262,11 @@ namespace orc {
if (listBatch == nullptr) {
throw InvalidArgument("Failed to cast to ListVectorBatch");
}
+ CollectionColumnStatisticsImpl* collectionStats =
+ dynamic_cast<CollectionColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (collectionStats == nullptr) {
+ throw InvalidArgument("Failed to cast to CollectionColumnStatisticsImpl");
+ }
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
@@ -2285,20 +2290,21 @@ namespace orc {
if (enableIndex) {
if (!notNull) {
- colIndexStatistics->increase(numValues);
+ collectionStats->increase(numValues);
} else {
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
++count;
+ collectionStats->update(static_cast<uint64_t>(offsets[i]));
if (enableBloomFilter) {
bloomFilter->addLong(offsets[i]);
}
}
}
- colIndexStatistics->increase(count);
+ collectionStats->increase(count);
if (count < numValues) {
- colIndexStatistics->setHasNull(true);
+ collectionStats->setHasNull(true);
}
}
}
@@ -2488,6 +2494,11 @@ namespace orc {
if (mapBatch == nullptr) {
throw InvalidArgument("Failed to cast to MapVectorBatch");
}
+ CollectionColumnStatisticsImpl* collectionStats =
+ dynamic_cast<CollectionColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (collectionStats == nullptr) {
+ throw InvalidArgument("Failed to cast to CollectionColumnStatisticsImpl");
+ }
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
@@ -2515,20 +2526,21 @@ namespace orc {
if (enableIndex) {
if (!notNull) {
- colIndexStatistics->increase(numValues);
+ collectionStats->increase(numValues);
} else {
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
++count;
+ collectionStats->update(static_cast<uint64_t>(offsets[i]));
if (enableBloomFilter) {
bloomFilter->addLong(offsets[i]);
}
}
}
- colIndexStatistics->increase(count);
+ collectionStats->increase(count);
if (count < numValues) {
- colIndexStatistics->setHasNull(true);
+ collectionStats->setHasNull(true);
}
}
}
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index b0c9de8..53e0da2 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -30,6 +30,8 @@ namespace orc {
return new IntegerColumnStatisticsImpl(s);
} else if (s.has_doublestatistics()) {
return new DoubleColumnStatisticsImpl(s);
+ } else if (s.has_collectionstatistics()) {
+ return new CollectionColumnStatisticsImpl(s);
} else if (s.has_stringstatistics()) {
return new StringColumnStatisticsImpl(s, statContext);
} else if (s.has_bucketstatistics()) {
@@ -135,6 +137,10 @@ namespace orc {
// PASS
}
+ // Out-of-line definition of the virtual destructor declared in
+ // include/orc/Statistics.hh; the interface has nothing to release.
+ CollectionColumnStatistics::~CollectionColumnStatistics() {
+ // PASS
+ }
+
MutableColumnStatistics::~MutableColumnStatistics() {
// PASS
}
@@ -167,6 +173,10 @@ namespace orc {
// PASS
}
+ // Out-of-line definition of the destructor declared in src/Statistics.hh;
+ // the only data member (_stats) is held by value, so no explicit cleanup
+ // is written here.
+ CollectionColumnStatisticsImpl::~CollectionColumnStatisticsImpl() {
+ // PASS
+ }
+
void IntegerColumnStatisticsImpl::update(int64_t value, int repetitions) {
_stats.updateMinMax(value);
@@ -381,6 +391,26 @@ namespace orc {
}
}
+ /**
+ * Build collection statistics from their serialized protobuf form.
+ *
+ * The value count and null flag are always copied. If the message carries a
+ * CollectionStatistics section, the presence flags and values of min, max
+ * and total children are taken from it; otherwise the three values are
+ * zeroed and the presence flags are not touched here.
+ *
+ * @param pb serialized column statistics to read from
+ */
+ CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl
+ (const proto::ColumnStatistics& pb) {
+   _stats.setNumberOfValues(pb.numberofvalues());
+   _stats.setHasNull(pb.hasnull());
+
+   if (pb.has_collectionstatistics()) {
+     const proto::CollectionStatistics& children = pb.collectionstatistics();
+
+     // Presence flags first, mirroring which optional fields were written.
+     _stats.setHasMinimum(children.has_minchildren());
+     _stats.setHasMaximum(children.has_maxchildren());
+     _stats.setHasSum(children.has_totalchildren());
+
+     // Then the values themselves, read through the protobuf getters.
+     _stats.setMinimum(children.minchildren());
+     _stats.setMaximum(children.maxchildren());
+     _stats.setSum(children.totalchildren());
+   } else {
+     // No collection section in the message: store neutral values only.
+     _stats.setMinimum(0);
+     _stats.setMaximum(0);
+     _stats.setSum(0);
+   }
+ }
+
std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
const Type& type) {
switch (static_cast<int64_t>(type.getKind())) {
@@ -393,9 +423,11 @@ namespace orc {
case SHORT:
return std::unique_ptr<MutableColumnStatistics>(
new IntegerColumnStatisticsImpl());
- case STRUCT:
case MAP:
case LIST:
+ return std::unique_ptr<MutableColumnStatistics>(
+ new CollectionColumnStatisticsImpl());
+ case STRUCT:
case UNION:
return std::unique_ptr<MutableColumnStatistics>(
new ColumnStatisticsImpl());
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index 434a0ad..f056aba 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -173,6 +173,7 @@ namespace orc {
typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics;
typedef InternalStatisticsImpl<std::string> InternalStringStatistics;
+ typedef InternalStatisticsImpl<uint64_t> InternalCollectionStatistics;
/**
* Mutable column statistics for use by the writer.
@@ -1463,6 +1464,170 @@ namespace orc {
}
};
+ /**
+ * Writer-side and reader-side implementation of collection (LIST/MAP)
+ * column statistics.
+ *
+ * Stores the minimum, maximum and total number of children in an
+ * InternalCollectionStatistics (min/max/sum over uint64_t). The total is
+ * marked undefined as soon as adding to it wraps around, so a defined total
+ * means no overflow was detected.
+ */
+ class CollectionColumnStatisticsImpl : public CollectionColumnStatistics,
+                                        public MutableColumnStatistics {
+ private:
+   InternalCollectionStatistics _stats;
+
+ public:
+   /** Create empty statistics; see reset() for the initial state. */
+   CollectionColumnStatisticsImpl() { reset(); }
+
+   /** Create statistics from their serialized protobuf form. */
+   CollectionColumnStatisticsImpl(const proto::ColumnStatistics &stats);
+   virtual ~CollectionColumnStatisticsImpl() override;
+
+   /** @return true if a minimum children count is defined */
+   bool hasMinimumChildren() const override {
+     return _stats.hasMinimum();
+   }
+
+   /** @return true if a maximum children count is defined */
+   bool hasMaximumChildren() const override {
+     return _stats.hasMaximum();
+   }
+
+   /** @return true if a total children count is defined (no overflow seen) */
+   bool hasTotalChildren() const override {
+     return _stats.hasSum();
+   }
+
+   /**
+    * Add to the number of values recorded for the column.
+    * @param count number of additional values
+    */
+   void increase(uint64_t count) override {
+     _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
+   }
+
+   /** @return the number of values recorded for the column */
+   uint64_t getNumberOfValues() const override {
+     return _stats.getNumberOfValues();
+   }
+
+   /** @param value new number of values */
+   void setNumberOfValues(uint64_t value) override {
+     _stats.setNumberOfValues(value);
+   }
+
+   /** @return true if the column was recorded as containing nulls */
+   bool hasNull() const override {
+     return _stats.hasNull();
+   }
+
+   /** @param hasNull new value of the has-null flag */
+   void setHasNull(bool hasNull) override {
+     _stats.setHasNull(hasNull);
+   }
+
+   /**
+    * @return the minimum children count
+    * @throws ParseError if the minimum is not defined
+    */
+   uint64_t getMinimumChildren() const override {
+     if (hasMinimumChildren()) {
+       return _stats.getMinimum();
+     } else {
+       throw ParseError("MinimumChildren is not defined.");
+     }
+   }
+
+   /**
+    * @return the maximum children count
+    * @throws ParseError if the maximum is not defined
+    */
+   uint64_t getMaximumChildren() const override {
+     if (hasMaximumChildren()) {
+       return _stats.getMaximum();
+     } else {
+       throw ParseError("MaximumChildren is not defined.");
+     }
+   }
+
+   /**
+    * @return the total children count
+    * @throws ParseError if the total is not defined
+    */
+   uint64_t getTotalChildren() const override {
+     if (hasTotalChildren()) {
+       return _stats.getSum();
+     } else {
+       throw ParseError("TotalChildren is not defined.");
+     }
+   }
+
+   /** Set the minimum children count and mark it as defined. */
+   void setMinimumChildren(uint64_t minimum) override {
+     _stats.setHasMinimum(true);
+     _stats.setMinimum(minimum);
+   }
+
+   /** Set the maximum children count and mark it as defined. */
+   void setMaximumChildren(uint64_t maximum) override {
+     _stats.setHasMaximum(true);
+     _stats.setMaximum(maximum);
+   }
+
+   /** Set the total children count and mark it as defined. */
+   void setTotalChildren(uint64_t sum) override {
+     _stats.setHasSum(true);
+     _stats.setSum(sum);
+   }
+
+   /** Mark the total as defined/undefined without changing its value. */
+   void setHasTotalChildren(bool hasSum) override {
+     _stats.setHasSum(hasSum);
+   }
+
+   /**
+    * Fold another set of collection statistics into this one.
+    *
+    * The generic part is delegated to _stats.merge(); the total is combined
+    * here so that unsigned wrap-around can be detected.
+    *
+    * @param other statistics to merge; must be a
+    *        CollectionColumnStatisticsImpl, otherwise the reference
+    *        dynamic_cast throws std::bad_cast
+    */
+   void merge(const MutableColumnStatistics& other) override {
+     const CollectionColumnStatisticsImpl& collectionStats =
+         dynamic_cast<const CollectionColumnStatisticsImpl&>(other);
+
+     _stats.merge(collectionStats._stats);
+
+     // hasSumValue here means no overflow
+     _stats.setHasSum(_stats.hasSum() && collectionStats.hasTotalChildren());
+     if (_stats.hasSum()) {
+       uint64_t oldSum = _stats.getSum();
+       _stats.setSum(_stats.getSum() + collectionStats.getTotalChildren());
+       // Unsigned addition wrapped if the result is smaller than before.
+       if (oldSum > _stats.getSum()) {
+         _stats.setHasSum(false);
+       }
+     }
+   }
+
+   /**
+    * Return to the initial state: internal statistics reset, and the total
+    * children count defined as 0.
+    */
+   void reset() override {
+     _stats.reset();
+     setTotalChildren(0);
+   }
+
+   /**
+    * Record one collection value.
+    * @param value number of children of that collection value
+    */
+   void update(uint64_t value) {
+     _stats.updateMinMax(value);
+     if (_stats.hasSum()) {
+       uint64_t oldSum = _stats.getSum();
+       _stats.setSum(_stats.getSum() + value);
+       // Unsigned addition wrapped: the total can no longer be trusted.
+       if (oldSum > _stats.getSum()) {
+         _stats.setHasSum(false);
+       }
+     }
+   }
+
+   /**
+    * Serialize these statistics into a protobuf message.
+    *
+    * Minimum and maximum are written independently, each under its own
+    * presence flag, matching how the protobuf constructor reads
+    * has_minchildren()/has_maxchildren() and how setMinimumChildren() and
+    * setMaximumChildren() set the flags one at a time.
+    *
+    * @param pbStats message to fill in
+    */
+   void toProtoBuf(proto::ColumnStatistics &pbStats) const override {
+     pbStats.set_hasnull(_stats.hasNull());
+     pbStats.set_numberofvalues(_stats.getNumberOfValues());
+
+     proto::CollectionStatistics* collectionStats =
+         pbStats.mutable_collectionstatistics();
+     if (_stats.hasMinimum()) {
+       collectionStats->set_minchildren(_stats.getMinimum());
+     } else {
+       collectionStats->clear_minchildren();
+     }
+     if (_stats.hasMaximum()) {
+       collectionStats->set_maxchildren(_stats.getMaximum());
+     } else {
+       collectionStats->clear_maxchildren();
+     }
+     if (_stats.hasSum()) {
+       collectionStats->set_totalchildren(_stats.getSum());
+     } else {
+       collectionStats->clear_totalchildren();
+     }
+   }
+
+   /** @return a multi-line, human-readable dump of the statistics */
+   std::string toString() const override {
+     std::ostringstream buffer;
+     buffer << "Data type: Collection(LIST|MAP)" << std::endl
+            << "Values: " << getNumberOfValues() << std::endl
+            << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
+     if (hasMinimumChildren()) {
+       buffer << "MinChildren: " << getMinimumChildren() << std::endl;
+     } else {
+       buffer << "MinChildren is not defined" << std::endl;
+     }
+
+     if (hasMaximumChildren()) {
+       buffer << "MaxChildren: " << getMaximumChildren() << std::endl;
+     } else {
+       buffer << "MaxChildren is not defined" << std::endl;
+     }
+
+     if (hasTotalChildren()) {
+       buffer << "TotalChildren: " << getTotalChildren() << std::endl;
+     } else {
+       buffer << "TotalChildren is not defined" << std::endl;
+     }
+     return buffer.str();
+   }
+ };
+
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
const StatContext& statContext);
diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc
index 4d187b9..77f3e2d 100644
--- a/c++/test/TestColumnStatistics.cc
+++ b/c++/test/TestColumnStatistics.cc
@@ -490,4 +490,58 @@ namespace orc {
EXPECT_EQ(9999, tsStatsFromPb->getMaximumNanos());
}
+ // Exercises CollectionColumnStatisticsImpl: initial state, value counting,
+ // min/max/total tracking through update(), merge(), and loss of the total
+ // when the unsigned sum wraps around.
+ TEST(ColumnStatistics, collectionColumnStatistics) {
+ std::unique_ptr<CollectionColumnStatisticsImpl> collectionStats(
+ new CollectionColumnStatisticsImpl());
+
+ // initial state
+ // min/max are expected to be undefined, while the total is defined as 0
+ EXPECT_EQ(0, collectionStats->getNumberOfValues());
+ EXPECT_FALSE(collectionStats->hasNull());
+ EXPECT_FALSE(collectionStats->hasMinimumChildren());
+ EXPECT_FALSE(collectionStats->hasMaximumChildren());
+ EXPECT_TRUE(collectionStats->hasTotalChildren());
+ EXPECT_EQ(0, collectionStats->getTotalChildren());
+
+ // normal operations
+ collectionStats->increase(1);
+ EXPECT_EQ(1, collectionStats->getNumberOfValues());
+
+ collectionStats->increase(0);
+ EXPECT_EQ(1, collectionStats->getNumberOfValues());
+
+ collectionStats->increase(9999999999999999l);
+ EXPECT_EQ(10000000000000000l, collectionStats->getNumberOfValues());
+
+ // the first update defines both min and max
+ collectionStats->update(10);
+ EXPECT_EQ(10, collectionStats->getMaximumChildren());
+ EXPECT_EQ(10, collectionStats->getMinimumChildren());
+
+ collectionStats->update(20);
+ EXPECT_EQ(20, collectionStats->getMaximumChildren());
+ EXPECT_EQ(10, collectionStats->getMinimumChildren());
+
+ // total = 10 + 20
+ EXPECT_EQ(30, collectionStats->getTotalChildren());
+ // test merge
+ std::unique_ptr<CollectionColumnStatisticsImpl> other(
+ new CollectionColumnStatisticsImpl());
+
+ other->update(40);
+ other->update(30);
+
+ // merge must leave `other` untouched and widen min/max of the target;
+ // merged total = 30 + (40 + 30)
+ collectionStats->merge(*other);
+ EXPECT_EQ(40, other->getMaximumChildren());
+ EXPECT_EQ(30, other->getMinimumChildren());
+ EXPECT_EQ(40, collectionStats->getMaximumChildren());
+ EXPECT_EQ(10, collectionStats->getMinimumChildren());
+ EXPECT_EQ(100, collectionStats->getTotalChildren());
+
+ // test overflow
+ // 70 + UINT64_MAX wraps around, so the total becomes undefined
+ other->update(std::numeric_limits<uint64_t>::max());
+ EXPECT_FALSE(other->hasTotalChildren());
+ // test merge overflow
+ // setTotalChildren() re-defines the total; 100 + (UINT64_MAX - 50) wraps
+ other->setTotalChildren(std::numeric_limits<uint64_t>::max() - 50);
+ EXPECT_EQ(std::numeric_limits<uint64_t>::max() - 50, other->getTotalChildren());
+ collectionStats->merge(*other);
+ EXPECT_FALSE(collectionStats->hasTotalChildren());
+ }
}