(cloudberry-backup) branch main updated: feat: support STATISTIC_KIND_NDV_BY_SEGMENTS in Cloudberry 2.1.0

djwang Sat, 28 Feb 2026 02:00:24 -0800

This is an automated email from the ASF dual-hosted git repository.

djwang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry-backup.git



The following commit(s) were added to refs/heads/main by this push:
     new 47171422 feat: support STATISTIC_KIND_NDV_BY_SEGMENTS in Cloudberry 2.1.0
47171422 is described below

commit 47171422be883f500efcaffc53d370028153f458
Author: Robert Mu <[email protected]>
AuthorDate: Sat Feb 28 16:57:03 2026 +0800

    feat: support STATISTIC_KIND_NDV_BY_SEGMENTS in Cloudberry 2.1.0
    
    Cloudberry 2.1.0 introduced a new statistic kind (stakind=8) to store
    the sum of local NDVs across all segments. Unlike other statistic kinds,
    this always stores an int8 array regardless of the column's underlying
    data type.
---
 backup/statistics.go                   | 42 +++++++++++++++++++++-------------
 backup/statistics_test.go              |  8 +++++--
 integration/statistics_queries_test.go | 33 ++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/backup/statistics.go b/backup/statistics.go
index 5c63866b..5c1f0e86 100644
--- a/backup/statistics.go
+++ b/backup/statistics.go
@@ -14,6 +14,11 @@ import (
        "github.com/lib/pq"
 )
 
+const (
+       // STATISTIC_KIND_NDV_BY_SEGMENTS is specific to Cloudberry Database 2.1.0+
+       STATISTIC_KIND_NDV_BY_SEGMENTS = 8
+)
+
 func PrintStatisticsStatements(statisticsFile *utils.FileWithByteCount, 
tocfile *toc.TOC, tables []Table, attStats map[uint32][]AttributeStatistic, 
tupleStats map[uint32]TupleStatistic) {
        for _, table := range tables {
                tupleQuery := GenerateTupleStatisticsQuery(table, 
tupleStats[table.Oid])
@@ -162,11 +167,11 @@ func generateAttributeSlotsQuery7(attStat 
AttributeStatistic) string {
                        realValues(attStat.Numbers3),
                        realValues(attStat.Numbers4),
                        realValues(attStat.Numbers5),
-                       AnyValues(attStat.Values1, attStat.Type),
-                       AnyValues(attStat.Values2, attStat.Type),
-                       AnyValues(attStat.Values3, attStat.Type),
-                       AnyValues(attStat.Values4, attStat.Type),
-                       AnyValues(attStat.Values5, attStat.Type))
+                       AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+                       AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+                       AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+                       AnyValues(attStat.Values4, attStat.Type, attStat.Kind4),
+                       AnyValues(attStat.Values5, attStat.Type, attStat.Kind5))
        }
        return attributeQuery
 }
@@ -230,11 +235,11 @@ func generateAttributeSlotsQuery6(attStat 
AttributeStatistic) string {
                        realValues(attStat.Numbers3),
                        realValues(attStat.Numbers4),
                        realValues(attStat.Numbers5),
-                       AnyValues(attStat.Values1, attStat.Type),
-                       AnyValues(attStat.Values2, attStat.Type),
-                       AnyValues(attStat.Values3, attStat.Type),
-                       AnyValues(attStat.Values4, attStat.Type),
-                       AnyValues(attStat.Values5, attStat.Type))
+                       AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+                       AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+                       AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+                       AnyValues(attStat.Values4, attStat.Type, attStat.Kind4),
+                       AnyValues(attStat.Values5, attStat.Type, attStat.Kind5))
        }
        return attributeQuery
 }
@@ -286,10 +291,10 @@ func generateAttributeSlotsQuery4(attStat 
AttributeStatistic) string {
                        realValues(attStat.Numbers2),
                        realValues(attStat.Numbers3),
                        realValues(attStat.Numbers4),
-                       AnyValues(attStat.Values1, attStat.Type),
-                       AnyValues(attStat.Values2, attStat.Type),
-                       AnyValues(attStat.Values3, attStat.Type),
-                       AnyValues(attStat.Values4, attStat.Type))
+                       AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+                       AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+                       AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+                       AnyValues(attStat.Values4, attStat.Type, attStat.Kind4))
        }
        return attributeQuery
 }
@@ -317,10 +322,15 @@ func realValues(reals pq.StringArray) string {
 /*
  * A given type is not guaranteed to have a corresponding array type, so we 
need
  * to use array_in() instead of casting to an array.
+ * STATISTIC_KIND_NDV_BY_SEGMENTS (8) is a special case which stores an array 
of
+ * int8 values rather than the column's native type.
  */
-func AnyValues(any pq.StringArray, typ string) string {
+func AnyValues(any pq.StringArray, typ string, kind int) string {
        if len(any) > 0 {
+               if kind == STATISTIC_KIND_NDV_BY_SEGMENTS {
+                       return fmt.Sprintf(`array_in(%s, 'int8'::regtype::oid, 
-1)`, SliceToPostgresArray(any))
+               }
                return fmt.Sprintf(`array_in(%s, '%s'::regtype::oid, -1)`, 
SliceToPostgresArray(any), typ)
        }
-       return fmt.Sprintf("NULL")
+       return "NULL"
 }
diff --git a/backup/statistics_test.go b/backup/statistics_test.go
index 5d58c8e2..4aed8b03 100644
--- a/backup/statistics_test.go
+++ b/backup/statistics_test.go
@@ -235,11 +235,15 @@ WHERE oid = 
'"""test''schema"""."""test''table"""'::regclass::oid;`))
        })
        Describe("AnyValues", func() {
                It("returns properly casted string when length of anyvalues is 
greater than 0", func() {
-                       castedString := backup.AnyValues([]string{"1", "2"}, 
"int")
+                       castedString := backup.AnyValues([]string{"1", "2"}, 
"int", 1)
                        Expect(castedString).To(Equal(`array_in('{"1","2"}', 
'int'::regtype::oid, -1)`))
                })
+               It("returns int8 casted string when kind is 
STATISTIC_KIND_NDV_BY_SEGMENTS", func() {
+                       castedString := backup.AnyValues([]string{"2"}, "bool", 
backup.STATISTIC_KIND_NDV_BY_SEGMENTS)
+                       Expect(castedString).To(Equal(`array_in('{"2"}', 
'int8'::regtype::oid, -1)`))
+               })
                It("returns NULL if anyvalues is of length 0", func() {
-                       castedString := backup.AnyValues([]string{}, "int")
+                       castedString := backup.AnyValues([]string{}, "int", 1)
                        Expect(castedString).To(Equal(`NULL`))
                })
        })
diff --git a/integration/statistics_queries_test.go 
b/integration/statistics_queries_test.go
index a6932d61..b7de66e6 100644
--- a/integration/statistics_queries_test.go
+++ b/integration/statistics_queries_test.go
@@ -56,6 +56,39 @@ var _ = Describe("backup integration tests", func() {
                                expectedStats5J.Collation1 = 100
                                expectedStats5J.Collation2 = 100
                        }
+                       if connectionPool.Version.IsCBDB() && 
connectionPool.Version.AtLeast("2.1.0") {
+                               // Cloudberry Database 2.1.0 introduced 
STATISTIC_KIND_NDV_BY_SEGMENTS (8).
+                               // In this test case, due to the small data 
volume, this statistic is
+                               // automatically placed into the 3rd slot 
(stakind3) by the analyze command.
+                               expectedStats5I.Kind3 = 
backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+                               expectedStats5J.Kind3 = 
backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+                               expectedStats5K.Kind3 = 
backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+
+                               // Set the operator OID for this new statistic 
kind
+                               // i (int4) uses operator 97 (=)
+                               // j (text) uses operator 664 (=) and collation 
100
+                               // k (bool) uses operator 58 (=)
+                               expectedStats5I.Operator3 = 97
+                               expectedStats5J.Operator3 = 664
+                               expectedStats5J.Collation3 = 100
+                               expectedStats5K.Operator3 = 58
+
+                               // 4 distinct rows were inserted for 'i' (int) 
and 'j' (text) columns
+                               expectedStats5I.Values3 = []string{"4"}
+                               expectedStats5J.Values3 = []string{"4"}
+
+                               // Why is 'k' (bool) 3.0000000596046448 instead 
of 2?
+                               // 1. STATISTIC_KIND_NDV_BY_SEGMENTS (8) is the 
SUM of local NDVs across all segments, NOT the global NDV.
+                               // 2. Based on the hash distribution (using 'i' 
as distribution key), the rows map to segments like so:
+                               //    - Seg 0 gets 3 rows: (2,b,f), (3,c,t), 
(4,d,f). Local NDV for 'k' on Seg 0 = 2 ('f' and 't')
+                               //    - Seg 1 gets 1 row: (1,a,t). Local NDV 
for 'k' on Seg 1 = 1 ('t')
+                               //    - Seg 2 gets 0 rows. Local NDV for 'k' on 
Seg 2 = 0
+                               //    - Sum of Local NDVs = 2 + 1 + 0 = 3
+                               // 3. The optimizer uses this to estimate 
intermediate rows generated during a two-stage aggregation (Partial Agg).
+                               // 4. The value is stored internally as a 
float4 (single precision) to save space, and when retrieved,
+                               //    it is converted back to double precision 
(float8), resulting in the slight precision loss (3.0000000596046448).
+                               expectedStats5K.Values3 = 
[]string{"3.0000000596046448"}
+                       }
 
                        // The order in which the stavalues1 values is returned 
is not guaranteed to be deterministic
                        sort.Strings(tableAttStatsI.Values1)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry-backup) branch main updated: feat: support STATISTIC_KIND_NDV_BY_SEGMENTS in Cloudberry 2.1.0

Reply via email to