This is an automated email from the ASF dual-hosted git repository.
djwang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry-backup.git
The following commit(s) were added to refs/heads/main by this push:
new 47171422 feat: support STATISTIC_KIND_NDV_BY_SEGMENTS in Cloudberry
2.1.0
47171422 is described below
commit 47171422be883f500efcaffc53d370028153f458
Author: Robert Mu <[email protected]>
AuthorDate: Sat Feb 28 16:57:03 2026 +0800
feat: support STATISTIC_KIND_NDV_BY_SEGMENTS in Cloudberry 2.1.0
Cloudberry 2.1.0 introduced a new statistic kind (stakind=8) that stores
the sum of the local NDVs (numbers of distinct values) across all segments.
Unlike the other statistic kinds, it always stores an int8 array, regardless
of the column's underlying data type.
---
backup/statistics.go | 42 +++++++++++++++++++++-------------
backup/statistics_test.go | 8 +++++--
integration/statistics_queries_test.go | 33 ++++++++++++++++++++++++++
3 files changed, 65 insertions(+), 18 deletions(-)
diff --git a/backup/statistics.go b/backup/statistics.go
index 5c63866b..5c1f0e86 100644
--- a/backup/statistics.go
+++ b/backup/statistics.go
@@ -14,6 +14,11 @@ import (
"github.com/lib/pq"
)
+const (
+ // STATISTIC_KIND_NDV_BY_SEGMENTS is specific to Cloudberry Database 2.1.0+
+ STATISTIC_KIND_NDV_BY_SEGMENTS = 8
+)
+
func PrintStatisticsStatements(statisticsFile *utils.FileWithByteCount,
tocfile *toc.TOC, tables []Table, attStats map[uint32][]AttributeStatistic,
tupleStats map[uint32]TupleStatistic) {
for _, table := range tables {
tupleQuery := GenerateTupleStatisticsQuery(table,
tupleStats[table.Oid])
@@ -162,11 +167,11 @@ func generateAttributeSlotsQuery7(attStat
AttributeStatistic) string {
realValues(attStat.Numbers3),
realValues(attStat.Numbers4),
realValues(attStat.Numbers5),
- AnyValues(attStat.Values1, attStat.Type),
- AnyValues(attStat.Values2, attStat.Type),
- AnyValues(attStat.Values3, attStat.Type),
- AnyValues(attStat.Values4, attStat.Type),
- AnyValues(attStat.Values5, attStat.Type))
+ AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+ AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+ AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+ AnyValues(attStat.Values4, attStat.Type, attStat.Kind4),
+ AnyValues(attStat.Values5, attStat.Type, attStat.Kind5))
}
return attributeQuery
}
@@ -230,11 +235,11 @@ func generateAttributeSlotsQuery6(attStat
AttributeStatistic) string {
realValues(attStat.Numbers3),
realValues(attStat.Numbers4),
realValues(attStat.Numbers5),
- AnyValues(attStat.Values1, attStat.Type),
- AnyValues(attStat.Values2, attStat.Type),
- AnyValues(attStat.Values3, attStat.Type),
- AnyValues(attStat.Values4, attStat.Type),
- AnyValues(attStat.Values5, attStat.Type))
+ AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+ AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+ AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+ AnyValues(attStat.Values4, attStat.Type, attStat.Kind4),
+ AnyValues(attStat.Values5, attStat.Type, attStat.Kind5))
}
return attributeQuery
}
@@ -286,10 +291,10 @@ func generateAttributeSlotsQuery4(attStat
AttributeStatistic) string {
realValues(attStat.Numbers2),
realValues(attStat.Numbers3),
realValues(attStat.Numbers4),
- AnyValues(attStat.Values1, attStat.Type),
- AnyValues(attStat.Values2, attStat.Type),
- AnyValues(attStat.Values3, attStat.Type),
- AnyValues(attStat.Values4, attStat.Type))
+ AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+ AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+ AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+ AnyValues(attStat.Values4, attStat.Type, attStat.Kind4))
}
return attributeQuery
}
@@ -317,10 +322,15 @@ func realValues(reals pq.StringArray) string {
/*
* A given type is not guaranteed to have a corresponding array type, so we need
* to use array_in() instead of casting to an array.
+ * STATISTIC_KIND_NDV_BY_SEGMENTS (8) is a special case which stores an array of
+ * int8 values rather than the column's native type.
*/
-func AnyValues(any pq.StringArray, typ string) string {
+func AnyValues(any pq.StringArray, typ string, kind int) string {
if len(any) > 0 {
+ if kind == STATISTIC_KIND_NDV_BY_SEGMENTS {
+ return fmt.Sprintf(`array_in(%s, 'int8'::regtype::oid,
-1)`, SliceToPostgresArray(any))
+ }
return fmt.Sprintf(`array_in(%s, '%s'::regtype::oid, -1)`,
SliceToPostgresArray(any), typ)
}
- return fmt.Sprintf("NULL")
+ return "NULL"
}
diff --git a/backup/statistics_test.go b/backup/statistics_test.go
index 5d58c8e2..4aed8b03 100644
--- a/backup/statistics_test.go
+++ b/backup/statistics_test.go
@@ -235,11 +235,15 @@ WHERE oid =
'"""test''schema"""."""test''table"""'::regclass::oid;`))
})
Describe("AnyValues", func() {
It("returns properly casted string when length of anyvalues is
greater than 0", func() {
- castedString := backup.AnyValues([]string{"1", "2"},
"int")
+ castedString := backup.AnyValues([]string{"1", "2"},
"int", 1)
Expect(castedString).To(Equal(`array_in('{"1","2"}',
'int'::regtype::oid, -1)`))
})
+ It("returns int8 casted string when kind is
STATISTIC_KIND_NDV_BY_SEGMENTS", func() {
+ castedString := backup.AnyValues([]string{"2"}, "bool",
backup.STATISTIC_KIND_NDV_BY_SEGMENTS)
+ Expect(castedString).To(Equal(`array_in('{"2"}',
'int8'::regtype::oid, -1)`))
+ })
It("returns NULL if anyvalues is of length 0", func() {
- castedString := backup.AnyValues([]string{}, "int")
+ castedString := backup.AnyValues([]string{}, "int", 1)
Expect(castedString).To(Equal(`NULL`))
})
})
diff --git a/integration/statistics_queries_test.go
b/integration/statistics_queries_test.go
index a6932d61..b7de66e6 100644
--- a/integration/statistics_queries_test.go
+++ b/integration/statistics_queries_test.go
@@ -56,6 +56,39 @@ var _ = Describe("backup integration tests", func() {
expectedStats5J.Collation1 = 100
expectedStats5J.Collation2 = 100
}
+ if connectionPool.Version.IsCBDB() &&
connectionPool.Version.AtLeast("2.1.0") {
+ // Cloudberry Database 2.1.0 introduced
STATISTIC_KIND_NDV_BY_SEGMENTS (8).
+ // In this test case, due to the small data
volume, this statistic is
+ // automatically placed into the 3rd slot
(stakind3) by the analyze command.
+ expectedStats5I.Kind3 =
backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+ expectedStats5J.Kind3 =
backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+ expectedStats5K.Kind3 =
backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+
+ // Set the operator OID for this new statistic
kind
+ // i (int4) uses operator 97 (=)
+ // j (text) uses operator 664 (=) and collation
100
+ // k (bool) uses operator 58 (=)
+ expectedStats5I.Operator3 = 97
+ expectedStats5J.Operator3 = 664
+ expectedStats5J.Collation3 = 100
+ expectedStats5K.Operator3 = 58
+
+ // 4 distinct rows were inserted for 'i' (int)
and 'j' (text) columns
+ expectedStats5I.Values3 = []string{"4"}
+ expectedStats5J.Values3 = []string{"4"}
+
+ // Why is 'k' (bool) 3.0000000596046448 instead
of 2?
+ // 1. STATISTIC_KIND_NDV_BY_SEGMENTS (8) is the
SUM of local NDVs across all segments, NOT the global NDV.
+ // 2. Based on the hash distribution (using 'i'
as distribution key), the rows map to segments like so:
+ // - Seg 0 gets 3 rows: (2,b,f), (3,c,t),
(4,d,f). Local NDV for 'k' on Seg 0 = 2 ('f' and 't')
+ // - Seg 1 gets 1 row: (1,a,t). Local NDV
for 'k' on Seg 1 = 1 ('t')
+ // - Seg 2 gets 0 rows. Local NDV for 'k' on
Seg 2 = 0
+ // - Sum of Local NDVs = 2 + 1 + 0 = 3
+ // 3. The optimizer uses this to estimate
intermediate rows generated during a two-stage aggregation (Partial Agg).
+ // 4. The value is stored internally as a
float4 (single precision) to save space, and when retrieved,
+ // it is converted back to double precision
(float8), resulting in the slight precision loss (3.0000000596046448).
+ expectedStats5K.Values3 =
[]string{"3.0000000596046448"}
+ }
// The order in which the stavalues1 values is returned
is not guaranteed to be deterministic
sort.Strings(tableAttStatsI.Values1)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]